diff --git a/.gitignore b/.gitignore index ed131bdbbad6bd4dad500fa29f40a29fddeb7593..9823f8c945c1be8e717b622a993d402c49517b7c 100644 --- a/.gitignore +++ b/.gitignore @@ -105,3 +105,5 @@ metal/paddle-mobile-demo/paddle-mobile-demo/Resources metal/paddle-mobile-demo/paddle-mobile-demo/Resources/images metal/paddle-mobile-demo/paddle-mobile-demo/Resources/models metal/MobileNetDemo/MobileNetDemo/Resources + +build* diff --git a/CMakeLists.txt b/CMakeLists.txt index e3f7a211d70920aa74765b976af6939d55a328ab..377e58d3ac7c37271d2a813b22912528c556164b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -61,6 +61,7 @@ lite_option(LITE_WITH_ARM "Enable ARM in lite mode" OFF) lite_option(LITE_WITH_NPU "Enable NPU in lite mode" OFF) lite_option(LITE_WITH_XPU "Enable XPU in lite mode" OFF) lite_option(LITE_WITH_BM "Enable BM in lite mode" OFF) +lite_option(LITE_WITH_TRAIN "Enable training operators and kernels in lite" OFF) lite_option(LITE_WITH_OPENMP "Enable OpenMP in lite framework" ON) lite_option(LITE_WITH_OPENCL "Enable OpenCL support in lite" OFF) lite_option(LITE_WITH_FPGA "Enable FPGA support in lite" OFF) @@ -76,6 +77,7 @@ lite_option(LITE_BUILD_TAILOR "Enable tailoring library according to model" OFF) # cv build options lite_option(LITE_WITH_CV "Enable build cv image in lite" OFF) lite_option(LITE_WITH_STATIC_CUDA "Statically link cuda libraries." ON) +lite_option(LITE_WITH_ARM_CLANG "when arm lang is clang, its ON." OFF) # TODO(Superjomn) Remove WITH_ANAKIN option if not needed latter. if(ANDROID OR IOS OR ARMLINUX) @@ -130,7 +132,8 @@ endif() if (WITH_LITE AND LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) message(STATUS "Building the mobile framework") include(cross_compiling/postproject) - include(cross_compiling/npu) # check and prepare NPU DDK + include(device/npu) # check and prepare NPU DDK + include(device/xpu) # check and prepare XPU SDK # We compile the mobile deployment library when LITE_ON_TINY_PUBLISH=ON # So the following third party dependencies are not needed. 
@@ -171,7 +174,7 @@ endif() ######################################################################################## if(LITE_WITH_XPU) - include(xpu) + include(device/xpu) endif() include(external/mklml) # download mklml package diff --git a/README.md b/README.md index 22b84888294b5ef60c3d91d7a7909aef8f601d81..b72e4bc9307ba9e12f1252455668bd07f80f6029 100644 --- a/README.md +++ b/README.md @@ -3,14 +3,14 @@ # Paddle Lite -[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](https://paddlepaddle.github.io/Paddle-Lite/) +[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](https://paddle-lite.readthedocs.io/zh/latest/) [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE) Paddle Lite is an updated version of Paddle-Mobile, an open-open source deep learning framework designed to make it easy to perform inference on mobile, embeded, and IoT devices. It is compatible with PaddlePaddle and pre-trained models from other sources. -For tutorials, please see [PaddleLite Document](https://paddlepaddle.github.io/Paddle-Lite/). +For tutorials, please see [PaddleLite Document](https://paddle-lite.readthedocs.io/zh/latest/). 
## Key Features diff --git a/README_cn.md b/README_cn.md index 11d3967fe8ce88826ca982b71d96268c1a7e5c3a..4f5cd9254d42b4dc02035cb3ecfc8280b0e1c1ac 100644 --- a/README_cn.md +++ b/README_cn.md @@ -1,13 +1,13 @@ # Paddle Lite -[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](https://paddlepaddle.github.io/Paddle-Lite/) +[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](https://paddle-lite.readthedocs.io/zh/latest/) [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE) Paddle Lite为Paddle-Mobile的升级版,定位支持包括手机移动端在内更多场景的轻量化高效预测,支持更广泛的硬件和平台,是一个高性能、轻量级的深度学习预测引擎。在保持和PaddlePaddle无缝对接外,也兼容支持其他训练框架产出的模型。 -完整使用文档位于 [PaddleLite 文档](https://paddlepaddle.github.io/Paddle-Lite/) 。 +完整使用文档位于 [PaddleLite 文档](https://paddle-lite.readthedocs.io/zh/latest/) 。 ## 特性 diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 752b22461d9d1c36b3ca6a0bfe472a5dcc3ab976..d38c78f62fa2bed4f4483355de0683f1f5b7656b 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -122,6 +122,9 @@ if (LITE_WITH_ARM) endif() endif() +if (LITE_WITH_TRAIN) + add_definitions("-DLITE_WITH_TRAIN") +endif() if (WITH_ARM_DOTPROD) add_definitions("-DWITH_ARM_DOTPROD") diff --git a/cmake/cross_compiling/findar.cmake b/cmake/cross_compiling/findar.cmake index bcb0dc70fd811a5041244dedb4a4bcf5b540dc3a..0f86231e49cdca274da27b596305144251a65f4b 100644 --- a/cmake/cross_compiling/findar.cmake +++ b/cmake/cross_compiling/findar.cmake @@ -23,7 +23,7 @@ endif() get_filename_component(AR_PATH ${CMAKE_CXX_COMPILER} PATH) -find_file(AR_TOOL NAMES llvm-ar PATHS ${AR_PATH}) +find_file(AR_TOOL NAMES llvm-ar PATHS ${AR_PATH} NO_DEFAULT_PATH) if(NOT AR_TOOL) message(ERROR "Failed to find AR_TOOL in ${AR_PATH}") diff --git a/cmake/cross_compiling/postproject.cmake b/cmake/cross_compiling/postproject.cmake index 7466b3e6d438277ad31020f76665bf689df436f5..3db715ba74945d9e501637af5ef3086e4f11b294 100644 --- 
a/cmake/cross_compiling/postproject.cmake +++ b/cmake/cross_compiling/postproject.cmake @@ -57,10 +57,14 @@ function(check_linker_flag) endforeach() set(CMAKE_SHARED_LINKER_FLAGS ${CMAKE_SHARED_LINKER_FLAGS} PARENT_SCOPE) endfunction() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") if (LITE_ON_TINY_PUBLISH) - if(NOT LITE_WITH_PYTHON) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-exceptions") + if((NOT LITE_WITH_PYTHON)) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-exceptions") + endif() + if(LITE_WITH_OPENCL AND (ARM_TARGET_LANG STREQUAL "clang")) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fexceptions") endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math -Ofast -Os -fomit-frame-pointer -fno-asynchronous-unwind-tables -fno-unwind-tables") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden -fvisibility-inlines-hidden -ffunction-sections") diff --git a/cmake/cross_compiling/npu.cmake b/cmake/device/npu.cmake similarity index 83% rename from cmake/cross_compiling/npu.cmake rename to cmake/device/npu.cmake index c22bb1db4fbf8a7370ff3e7c9aca40cc94d550a2..88598f4690a157b20ac1873d84ad13c2f8652725 100644 --- a/cmake/cross_compiling/npu.cmake +++ b/cmake/device/npu.cmake @@ -17,15 +17,16 @@ if(NOT LITE_WITH_NPU) endif() if(NOT DEFINED NPU_DDK_ROOT) - set(NPU_DDK_ROOT $ENV{NPU_DDK_ROOT}) - if(NOT NPU_DDK_ROOT) - message(FATAL_ERROR "Must set NPU_DDK_ROOT or env NPU_DDK_ROOT when LITE_WITH_NPU=ON") - endif() + set(NPU_DDK_ROOT $ENV{NPU_DDK_ROOT}) + if(NOT NPU_DDK_ROOT) + message(FATAL_ERROR "Must set NPU_DDK_ROOT or env NPU_DDK_ROOT when LITE_WITH_NPU=ON") + endif() endif() message(STATUS "NPU_DDK_ROOT: ${NPU_DDK_ROOT}") find_path(NPU_DDK_INC NAMES HiAiModelManagerService.h - PATHS ${NPU_DDK_ROOT}/include NO_DEFAULT_PATH) + PATHS ${NPU_DDK_ROOT}/include + NO_DEFAULT_PATH) if(NOT NPU_DDK_INC) message(FATAL_ERROR "Can not find HiAiModelManagerService.h in ${NPU_DDK_ROOT}/include") endif() @@ -34,21 +35,24 @@ 
include_directories("${NPU_DDK_ROOT}/include") set(NPU_SUB_LIB_PATH "lib64") if(ARM_TARGET_ARCH_ABI STREQUAL "armv8") - set(NPU_SUB_LIB_PATH "lib64") + set(NPU_SUB_LIB_PATH "lib64") endif() if(ARM_TARGET_ARCH_ABI STREQUAL "armv7") - set(NPU_SUB_LIB_PATH "lib") + set(NPU_SUB_LIB_PATH "lib") endif() find_library(NPU_DDK_HIAI_FILE NAMES hiai - PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH}) + PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH} + NO_DEFAULT_PATH) find_library(NPU_DDK_IR_FILE NAMES hiai_ir - PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH}) + PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH} + NO_DEFAULT_PATH) find_library(NPU_DDK_IR_BUILD_FILE NAMES hiai_ir_build - PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH}) + PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH} + NO_DEFAULT_PATH) if(NOT NPU_DDK_HIAI_FILE) message(FATAL_ERROR "Can not find NPU_DDK_HIAI_FILE in ${NPU_DDK_ROOT}") @@ -76,6 +80,3 @@ endif() set(npu_runtime_libs npu_ddk_hiai CACHE INTERNAL "npu ddk runtime libs") set(npu_builder_libs npu_ddk_ir npu_ddk_ir_build CACHE INTERNAL "npu ddk builder libs") - - - diff --git a/cmake/xpu.cmake b/cmake/device/xpu.cmake similarity index 74% rename from cmake/xpu.cmake rename to cmake/device/xpu.cmake index 2112f6b658f5f89b20d63c957cd0b979299c350b..099833ee4cf80968671036cffe89329506bbf091 100644 --- a/cmake/xpu.cmake +++ b/cmake/device/xpu.cmake @@ -17,15 +17,16 @@ if(NOT LITE_WITH_XPU) endif() if(NOT DEFINED XPU_SDK_ROOT) - set(XPU_SDK_ROOT $ENV{XPU_SDK_ROOT}) - if(NOT XPU_SDK_ROOT) - message(FATAL_ERROR "Must set XPU_SDK_ROOT or env XPU_SDK_ROOT when LITE_WITH_XPU=ON") - endif() + set(XPU_SDK_ROOT $ENV{XPU_SDK_ROOT}) + if(NOT XPU_SDK_ROOT) + message(FATAL_ERROR "Must set XPU_SDK_ROOT or env XPU_SDK_ROOT when LITE_WITH_XPU=ON") + endif() endif() message(STATUS "XPU_SDK_ROOT: ${XPU_SDK_ROOT}") find_path(XPU_SDK_INC NAMES xtcl.h - PATHS ${XPU_SDK_ROOT}/XTCL/include/xtcl NO_DEFAULT_PATH) + PATHS ${XPU_SDK_ROOT}/XTCL/include/xtcl + NO_DEFAULT_PATH) if(NOT XPU_SDK_INC) message(FATAL_ERROR "Can not 
find xtcl.h in ${XPU_SDK_ROOT}/include") endif() @@ -34,7 +35,8 @@ include_directories("${XPU_SDK_ROOT}/XTCL/include") include_directories("${XPU_SDK_ROOT}/XTDK/include") find_library(XPU_SDK_XTCL_FILE NAMES xtcl - PATHS ${XPU_SDK_ROOT}/XTCL/so) + PATHS ${XPU_SDK_ROOT}/XTCL/so + NO_DEFAULT_PATH) if(NOT XPU_SDK_XTCL_FILE) message(FATAL_ERROR "Can not find XPU XTCL Library in ${XPU_SDK_ROOT}") @@ -45,7 +47,8 @@ else() endif() find_library(XPU_SDK_TVM_FILE NAMES tvm - PATHS ${XPU_SDK_ROOT}/XTCL/so) + PATHS ${XPU_SDK_ROOT}/XTCL/so + NO_DEFAULT_PATH) if(NOT XPU_SDK_TVM_FILE) message(FATAL_ERROR "Can not find XPU TVM Library in ${XPU_SDK_ROOT}") @@ -56,7 +59,8 @@ else() endif() find_library(XPU_SDK_XPU_API_FILE NAMES xpuapi - PATHS ${XPU_SDK_ROOT}/XTDK/shlib) + PATHS ${XPU_SDK_ROOT}/XTDK/shlib + NO_DEFAULT_PATH) if(NOT XPU_SDK_XPU_API_FILE) message(FATAL_ERROR "Can not find XPU API Library in ${XPU_SDK_ROOT}") @@ -67,7 +71,8 @@ else() endif() find_library(XPU_SDK_XPU_RT_FILE NAMES xpurt - PATHS ${XPU_SDK_ROOT}/XTDK/shlib) + PATHS ${XPU_SDK_ROOT}/XTDK/shlib + NO_DEFAULT_PATH) if(NOT XPU_SDK_XPU_RT_FILE) message(FATAL_ERROR "Can not find XPU RT Library in ${XPU_SDK_ROOT}") @@ -78,18 +83,12 @@ else() endif() find_library(XPU_SDK_XPU_JITC_FILE NAMES xpujitc - PATHS ${XPU_SDK_ROOT}/XTDK/shlib) - -if(NOT XPU_SDK_XPU_JITC_FILE) - message(FATAL_ERROR "Can not find XPU JITC Library in ${XPU_SDK_ROOT}") -else() - message(STATUS "Found XPU JITC Library: ${XPU_SDK_XPU_JITC_FILE}") - add_library(xpu_sdk_xpu_jitc SHARED IMPORTED GLOBAL) - set_property(TARGET xpu_sdk_xpu_jitc PROPERTY IMPORTED_LOCATION ${XPU_SDK_XPU_JITC_FILE}) -endif() + PATHS ${XPU_SDK_ROOT}/XTDK/shlib + NO_DEFAULT_PATH) find_library(XPU_SDK_LLVM_FILE NAMES LLVM-8 - PATHS ${XPU_SDK_ROOT}/XTDK/shlib) + PATHS ${XPU_SDK_ROOT}/XTDK/shlib + NO_DEFAULT_PATH) if(NOT XPU_SDK_LLVM_FILE) message(FATAL_ERROR "Can not find LLVM Library in ${XPU_SDK_ROOT}") @@ -99,7 +98,7 @@ else() set_property(TARGET xpu_sdk_llvm PROPERTY 
IMPORTED_LOCATION ${XPU_SDK_LLVM_FILE}) endif() -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDMLC_USE_GLOG=1 -D_GLIBCXX_USE_CXX11_ABI=0") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDMLC_USE_GLOG=0") -set(xpu_runtime_libs xpu_sdk_xtcl xpu_sdk_tvm xpu_sdk_xpu_api xpu_sdk_xpu_rt xpu_sdk_xpu_jitc xpu_sdk_llvm CACHE INTERNAL "xpu runtime libs") -set(xpu_builder_libs xpu_sdk_xtcl xpu_sdk_tvm xpu_sdk_xpu_api xpu_sdk_xpu_rt xpu_sdk_xpu_jitc xpu_sdk_llvm CACHE INTERNAL "xpu builder libs") +set(xpu_runtime_libs xpu_sdk_xtcl xpu_sdk_tvm xpu_sdk_xpu_api xpu_sdk_xpu_rt xpu_sdk_llvm CACHE INTERNAL "xpu runtime libs") +set(xpu_builder_libs xpu_sdk_xtcl xpu_sdk_tvm xpu_sdk_xpu_api xpu_sdk_xpu_rt xpu_sdk_llvm CACHE INTERNAL "xpu builder libs") diff --git a/cmake/lite.cmake b/cmake/lite.cmake index fd40fa437b52ff33089b55c6cfb7df6604a0530d..265de3fbf68542f1b1525257887cbfaa4d1c4d62 100644 --- a/cmake/lite.cmake +++ b/cmake/lite.cmake @@ -275,6 +275,11 @@ set(host_kernels CACHE INTERNAL "host kernels") set(kernels_src_list "${CMAKE_BINARY_DIR}/kernels_src_list.txt") file(WRITE ${kernels_src_list} "") # clean + +# file to record faked kernels for opt python lib +set(fake_kernels_src_list "${CMAKE_BINARY_DIR}/fake_kernels_src_list.txt") +file(WRITE ${fake_kernels_src_list} "") # clean + if(LITE_BUILD_TAILOR) set(tailored_kernels_list_path "${LITE_OPTMODEL_DIR}/.tailored_kernels_source_list") file(STRINGS ${tailored_kernels_list_path} tailored_kernels_list) @@ -303,62 +308,74 @@ function(add_kernel TARGET device level) return() endif() - if (LITE_ON_MODEL_OPTIMIZE_TOOL) - # the source list will collect for model_optimize_tool to fake kernel generation. - foreach(src ${args_SRCS}) - file(APPEND ${kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") - endforeach() - return() - endif() - - # when compiling the model_optimize_tool, a source file with all the fake kernel definitions will be generated, - # no need to continue the compilation of the true kernel source. 
- if (LITE_ON_MODEL_OPTIMIZE_TOOL) - return() - endif(LITE_ON_MODEL_OPTIMIZE_TOOL) - if ("${device}" STREQUAL "Host") set(host_kernels "${host_kernels};${TARGET}" CACHE INTERNAL "") endif() if ("${device}" STREQUAL "ARM") if (NOT LITE_WITH_ARM) + foreach(src ${args_SRCS}) + file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") + endforeach() return() endif() set(arm_kernels "${arm_kernels};${TARGET}" CACHE INTERNAL "") endif() if ("${device}" STREQUAL "X86") if (NOT LITE_WITH_X86) + foreach(src ${args_SRCS}) + file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") + endforeach() + return() + elseif (LITE_ON_MODEL_OPTIMIZE_TOOL) + foreach(src ${args_SRCS}) + file(APPEND ${kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") + endforeach() return() endif() set(x86_kernels "${x86_kernels};${TARGET}" CACHE INTERNAL "") endif() if ("${device}" STREQUAL "NPU") if (NOT LITE_WITH_NPU) + foreach(src ${args_SRCS}) + file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") + endforeach() return() endif() set(npu_kernels "${npu_kernels};${TARGET}" CACHE INTERNAL "") endif() if ("${device}" STREQUAL "XPU") if (NOT LITE_WITH_XPU) + foreach(src ${args_SRCS}) + file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") + endforeach() return() endif() set(xpu_kernels "${xpu_kernels};${TARGET}" CACHE INTERNAL "") endif() if ("${device}" STREQUAL "FPGA") if (NOT LITE_WITH_FPGA) + foreach(src ${args_SRCS}) + file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") + endforeach() return() endif() set(fpga_kernels "${fpga_kernels};${TARGET}" CACHE INTERNAL "") endif() if ("${device}" STREQUAL "BM") if (NOT LITE_WITH_BM) + foreach(src ${args_SRCS}) + file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") + endforeach() return() endif() set(bm_kernels "${bm_kernels};${TARGET}" CACHE INTERNAL "") endif() if ("${device}" STREQUAL "OPENCL") if (NOT 
LITE_WITH_OPENCL) + foreach(src ${args_SRCS}) + file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") + endforeach() return() endif() set(opencl_kernels "${opencl_kernels};${TARGET}" CACHE INTERNAL "") @@ -366,6 +383,9 @@ function(add_kernel TARGET device level) if ("${device}" STREQUAL "CUDA") if (NOT LITE_WITH_CUDA) + foreach(src ${args_SRCS}) + file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") + endforeach() return() endif() set(cuda_kernels "${cuda_kernels};${TARGET}" CACHE INTERNAL "") diff --git a/cmake/mlu.cmake b/cmake/mlu.cmake new file mode 100644 index 0000000000000000000000000000000000000000..b73ab16462b83e952807289d511fdb95ad74c6cd --- /dev/null +++ b/cmake/mlu.cmake @@ -0,0 +1,61 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +if(NOT LITE_WITH_MLU) + return() +endif() + +if(NOT DEFINED NEUWARE_HOME) + set(NEUWARE_HOME $ENV{NEUWARE_HOME}) + if(NOT NEUWARE_HOME) + message(FATAL_ERROR "Must set NEUWARE_HOME or env NEUWARE_HOME when LITE_WITH_MLU=ON") + endif() +endif() + +message(STATUS "LITE_WITH_MLU: ${LITE_WITH_MLU}") +find_path(CNML_INC NAMES cnml.h + PATHS ${NEUWARE_HOME}/include NO_DEFAULT_PATH) +if(NOT CNML_INC) + message(FATAL_ERROR "Can not find cnml.h in ${NEUWARE_HOME}/include") +endif() + +find_path(CNRT_INC NAMES cnrt.h + PATHS ${NEUWARE_HOME}/include NO_DEFAULT_PATH) +if(NOT CNRT_INC) + message(FATAL_ERROR "Can not find cnrt.h in ${NEUWARE_HOME}/include") +endif() + +include_directories("${NEUWARE_HOME}/include") + +find_library(CNML_LIB_FILE NAMES cnml + PATHS ${NEUWARE_HOME}/lib64) + +if(NOT CNML_LIB_FILE) + message(FATAL_ERROR "Can not find CNML Library in ${NEUWARE_HOME}/lib64") +else() + message(STATUS "Found CNML Library: ${CNML_LIB_FILE}") + add_library(cnml_lib SHARED IMPORTED GLOBAL) + set_property(TARGET cnml_lib PROPERTY IMPORTED_LOCATION ${CNML_LIB_FILE}) +endif() + +find_library(CNRT_LIB_FILE NAMES cnrt + PATHS ${NEUWARE_HOME}/lib64) + +if(NOT CNRT_LIB_FILE) + message(FATAL_ERROR "Can not find CNRT Library in ${NEUWARE_HOME}/lib64") +else() + message(STATUS "Found CNRT Library: ${CNRT_LIB_FILE}") + add_library(cnrt_lib SHARED IMPORTED GLOBAL) + set_property(TARGET cnrt_lib PROPERTY IMPORTED_LOCATION ${CNRT_LIB_FILE}) +endif() diff --git a/docs/advanced_user_guides/index.rst b/docs/advanced_user_guides/index.rst deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/docs/api_reference/cv.md b/docs/api_reference/cv.md new file mode 100644 index 0000000000000000000000000000000000000000..5110e40c423c39e33feb084fa0d09c89ddd13d16 --- /dev/null +++ b/docs/api_reference/cv.md @@ -0,0 +1,263 @@ +# CV图像预处理API + +请把编译脚本`Paddle-Lite/lite/too/build.sh`中`BUILD_CV`变量设置为`ON`, 
其他编译参数设置请参考[源码编译](../user_guides/source_compile), 以确保 Lite 可以正确编译。这样`CV`图像的加速库就会编译进去,且会生成`paddle_image_preprocess.h`的API文件 + +- 硬件平台: `ARM` +- 操作系统:`MAC` 和 `LINUX` + +## CV 图像预处理功能 + +Lite 支持不同颜色空间的图像相互转换 `Convert` 、缩放 `Resize` 、翻转 `Flip`、旋转 `Rotate` 和图像数据转换为 `Tensor` 存储`ImageToTensor` 功能,下文将详细介绍每个功能的API接口。 + +### CV 枚举变量和结构体变量 + +- 颜色空间 +```cpp +enum ImageFormat { + RGBA = 0, + BGRA, + RGB, + BGR, + GRAY, + NV21 = 11, + NV12, +}; +``` +- 翻转参数 +```cpp +enum FlipParam { + X = 0, // flip along the X axis + Y, // flip along the Y axis + XY // flip along the XY axis +}; +``` +- 转换参数 +```cpp +typedef struct { + int ih; // input height + int iw; // input width + int oh; // outpu theight + int ow; // output width + FlipParam flip_param; // flip, support x, y, xy + float rotate_param; // rotate, support 90, 180, 270 +} TransParam; +``` + +### ImagePreprocess 类的成员变量 + +`ImagePreprocess` 类含有以下三个私有成员变量,通过构造函数进行初始化。 +```cpp +private: + ImageFormat srcFormat_; // input image color format + ImageFormat dstFormat_; // output image color format + TransParam transParam_; // image transform parameter + +// init +ImagePreprocess::ImagePreprocess(ImageFormat srcFormat, ImageFormat dstFormat, TransParam param) { + this->srcFormat_ = srcFormat; + this->dstFormat_ = dstFormat; + this->transParam_ = param; +} +``` + +### 颜色空间转换 Convert + +`Convert` 函数支持颜色空间:GRAY、NV12(NV21)、RGB(BGR)和RGBA(BGRA) + ++ 目前支持以下颜色空间的相互转换: + - GRAY2BGR + - GRAY2RGB + - BGR2RGB + - BGRA2BGR + - BGRA2RGB + - RGBA2RGB + - RGBA2BGR + - BGRA2RGBA + ++ 目前支持以下颜色空间的单向转换: + - NV12—BGR + - NV21—BGR + - NV12—RGB + - NV21—RGB + - NV12—BGRA + - NV21—BGRA + - NV12—RGBA + - NV21—RGBA + ++ `Convert` 功能的API接口 + ```cpp + // 方法一 + void ImagePreprocess::imageCovert(const uint8_t* src, uint8_t* dst); + // 方法二 + void ImagePreprocess::imageCovert(const uint8_t* src, + uint8_t* dst, ImageFormat srcFormat, ImageFormat dstFormat); + ``` + + + 第一个 `imageCovert` 接口,缺省参数来源于 `ImagePreprocess` 类的成员变量。故在初始化 `ImagePreprocess` 类的对象时,必须要给以下成员变量赋值: 
+ - param srcFormat:`ImagePreprocess` 类的成员变量`srcFormat_` + - param dstFormat:`ImagePreprocess` 类的成员变量`dstFormat_` + + - 第二个`imageCovert` 接口,可以直接使用 + +### 缩放 Resize + +`Resize` 功能支持颜色空间:GRAY、NV12(NV21)、RGB(BGR)和RGBA(BGRA) +`Resize` 功能目前支持的方法:`bilinear` + ++ `Resize` 功能的API接口 + ```cpp + // 方法一 + void ImagePreprocess::imageResize(const uint8_t* src, uint8_t* dst); + // 方法二 + void ImagePreprocess::imageResize(const uint8_t* src, uint8_t* dst, ImageFormat srcFormat, ImageFormat srcFormat, int srcw, int srch, int dstw, int dsth); + ``` + + + 第一个`imageResize` 接口,缺省参数来源于`ImagePreprocess` 类的成员变量。故在初始化`ImagePreprocess` 类的对象时,必须要给以下成员变量赋值: + - param srcFormat:`ImagePreprocess` 类的成员变量`dstFormat_` + - param srcw:`ImagePreprocess` 类的成员变量`transParam_.iw` + - param srch:`ImagePreprocess` 类的成员变量`transParam_.ih` + - param dstw:`ImagePreprocess` 类的成员变量`transParam_.ow` + - param dsth:`ImagePreprocess` 类的成员变量`transParam_.ow` + + - 第二个`imageResize` 接口,可以直接使用 + +### 旋转 Rotate + +`Rotate` 功能支持颜色空间:GRAY、RGB(BGR)和RGBA(BGRA) +`Rotate` 功能目前支持的角度:90、180 和 270 + ++ `Rotate` 功能的API接口 + ```cpp + // 方法一 + void ImagePreprocess::imageRotate(const uint8_t* src, uint8_t* dst); + // 方法二 + void ImagePreprocess::imageRotate(const uint8_t* src, uint8_t* dst, ImageFormat srcFormat, ImageFormat srcFormat, int srcw, int srch, float degree); + ``` + + + 第一个`imageRotate` 接口,缺省参数来源于`ImagePreprocess` 类的成员变量。故在初始化`ImagePreprocess` 类的对象时,必须要给以下成员变量赋值: + - param srcFormat:`ImagePreprocess` 类的成员变量`dstFormat_` + - param srcw:`ImagePreprocess` 类的成员变量`transParam_.ow` + - param srch:`ImagePreprocess` 类的成员变量`transParam_.oh` + - param degree:`ImagePreprocess` 类的成员变量`transParam_.rotate_param` + + - 第二个`imageRotate` 接口,可以直接使用 + +### 翻转 Flip + +`Flip` 功能支持颜色空间:GRAY、RGB(BGR)和RGBA(BGRA) +`Flip` 功能目前支持的功能:沿X轴翻转、沿Y轴翻转和沿XY轴翻转 + ++ `Flip` 功能的API接口 + ```cpp + // 方法一 + void ImagePreprocess::imageFlip(const uint8_t* src, uint8_t* dst); + // 方法二 + void ImagePreprocess::imageFlip(const uint8_t* src, uint8_t* dst, ImageFormat srcFormat, 
ImageFormat srcFormat, int srcw, int srch, FlipParam flip_param); + ``` + + + 第一个`imageFlip` 接口,缺省参数来源于`ImagePreprocess` 类的成员变量。故在初始化`ImagePreprocess` 类的对象时,必须要给以下成员变量赋值: + - param srcFormat:`ImagePreprocess` 类的成员变量`dstFormat_` + - param srcw:`ImagePreprocess` 类的成员变量`transParam_.ow` + - param srch:`ImagePreprocess` 类的成员变量`transParam_.oh` + - param flip_param:`ImagePreprocess` 类的成员变量`transParam_.flip_param` + + - 第二个`imageFlip` 接口,可以直接使用 + +### Image2Tensor + +`Image2Tensor` 功能支持颜色空间:RGB(BGR)和RGBA(BGRA) +`Image2Tensor` 功能目前支持的Layout:`NCHW`和 `NHWC` +`Image2Tensor` 不仅完成图像转换为`Tensor`数据处理,而且还完成了图像数据的归一化处理 + ++ `Image2Tensor` 功能的API接口 + ```cpp + // 方法一 + void ImagePreprocess::image2Tensor(const uint8_t* src, Tensor* dstTensor, LayoutType layout, float* means, float* scales); + // 方法二 + void ImagePreprocess::image2Tensor(const uint8_t* src, Tensor* dstTensor, ImageFormat srcFormat, srcw, int srch, LayoutType layout, float* means, float* scales; + ``` + + + 第一个`image2Tensor` 接口,缺省参数来源于`ImagePreprocess` 类的成员变量。故在初始化`ImagePreprocess` 类的对象时,必须要给以下成员变量赋值: + - param srcFormat:`ImagePreprocess` 类的成员变量`dstFormat_` + - param srcw:`ImagePreprocess` 类的成员变量`transParam_.ow` + - param srch:`ImagePreprocess` 类的成员变量`transParam_.oh` + + - 第二个`image2Tensor` 接口,可以直接使用 + + + +## CV 图像预处理 Demo 示例 + +例子:输入 `1920x1080` 大小的 `NV12` 图像src,输出 `960x540` 大小 `RGB` 格式的图像dst;然后,完成 `90` 度旋转和沿 `X` 轴翻转功能;最后,用 `NHWC` 格式存储在Tensor里。 + +定义 `ImagePreprocess` 类的对象,初始化成员变量 + +```cpp +// init +srcFormat = ImageFormat::NV12; +dstFormat = ImageFormat::RGB; +srch = 1920; +srcw = 1080; +dsth = 960; +dstw = 540; +flip_param = FlipParam::X; +degree = 90; +layout = LayoutType::NHWC +// 方法一: +TransParam tparam; +tparam.ih = srch; +tparam.iw = srcw; +tparam.oh = dsth; +tparam.ow = dstw; +tparam.flip_param = flip_param; +tparam.rotate_param = degree; +ImagePreprocess image_preprocess(srcFormat, dstFormat, tparam); +// 方法二: +ImagePreprocess image_preprocess(); +``` + +### imageConvert Demo + +```cpp +// 方法一: 
+image_preprocess.imageCovert(src, lite_dst); +// 方法二: +image_preprocess.imageCovert(src, lite_dst, (ImageFormat)srcFormat, (ImageFormat)dstFormat); +``` + +### imageResize Demo + +```cpp +// 方法一: +image_preprocess.imageResize(lite_dst, resize_tmp); +// 方法二: +image_preprocess.imageResize(lite_dst,resize_tmp, (ImageFormat)dstFormat, srcw, +srch, dstw, dsth); +``` + +### imageRotate Demo + +```cpp +// 方法一: +image_preprocess.imageRotate(resize_tmp, tv_out_ratote); +// 方法二: +image_preprocess.imageRotate(resize_tmp,tv_out_ratote, (ImageFormat)dstFormat, dstw, dsth, degree); +``` + +### imageFlip Demo + +```cpp +// 方法一: +image_preprocess.imageFlip(tv_out_ratote, tv_out_flip); +// 方法二: +image_preprocess.imageFlip(tv_out_ratote, tv_out_flip, (ImageFormat)dstFormat, dstw, dsth, flip_param); +``` + +### image2Tensor Demo + +```cpp +// 方法一: +image_preprocess.image2Tensor(tv_out_flip, &dst_tensor, layout, means, scales); +// 方法二: +image_preprocess.image2Tensor(tv_out_flip, &dst_tensor,(ImageFormat)dstFormat, dstw, dsth, layout, means, scales); +``` diff --git a/docs/api_reference/cxx_api_doc.md b/docs/api_reference/cxx_api_doc.md index 38385a4267d5727d9c5c7d985d3457dd011e203c..0b0f1f3d9b321959ef1f6210010da69fc0ffc7b8 100644 --- a/docs/api_reference/cxx_api_doc.md +++ b/docs/api_reference/cxx_api_doc.md @@ -1,5 +1,5 @@ -# C++ API文档 +# C++ API ## CreatePaddlePredictor @@ -260,14 +260,14 @@ class MobileConfig; `MobileConfig`用来配置构建轻量级PaddlePredictor的配置信息,如NaiveBuffer格式的模型地址、模型的内存地址(从内存加载模型时使用)、能耗模式、工作线程数等等。 -*注意:输入的模型需要使用[Model Optimize Tool](../model_optimize_tool)转化为NaiveBuffer格式的优化模型。* +*注意:输入的模型需要使用[Model Optimize Tool](../user_guides/model_optimize_tool)转化为NaiveBuffer格式的优化模型。* 示例: ```c++ MobileConfig config; // 设置NaiveBuffer格式模型目录,从文件加载模型时使用 -config.set_model_dir(FLAGS_model_dir); +config.set_model_from_file(); // 设置工作线程数 config.set_threads(4); // 设置能耗模式 @@ -277,13 +277,13 @@ config.set_power_mode(LITE_POWER_HIGH); std::shared_ptr predictor = CreatePaddlePredictor(config); 
``` -### `set_model_from_file(model_dir)` +### `set_model_from_file(model_file)` 设置模型文件,当需要从磁盘加载模型时使用。 参数: -- `model_dir(std::string)` - 模型文件路径 +- `model_file(std::string)` - 模型文件路径 返回:`None` @@ -589,7 +589,7 @@ for (int i = 0; i < ShapeProduction(output_tensor->shape()); i += 100) { 根据名称获取输出Tensor的指针。 -**注意**:`GetTensor`接口是为开发者设计的调试接口,可以输出[转化](../model_optimize_tool)后模型中的任一节点。如果出现`GetTensor(InputName)`返回值为空`Tensor`,可能原因是以该`InputName`命名的Tensor在模型转化的**子图融合**过程被融合替换了。 +**注意**:`GetTensor`接口是为开发者设计的调试接口,可以输出[转化](../user_guides/model_optimize_tool)后模型中的任一节点。如果出现`GetTensor(InputName)`返回值为空`Tensor`,可能原因是以该`InputName`命名的Tensor在模型转化的**子图融合**过程被融合替换了。 参数: diff --git a/docs/api_reference/index.rst b/docs/api_reference/index.rst deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/docs/api_reference/java_api_doc.md b/docs/api_reference/java_api_doc.md new file mode 100644 index 0000000000000000000000000000000000000000..3ef8edb6e68daef0a86c04d7bb216106d36b26d5 --- /dev/null +++ b/docs/api_reference/java_api_doc.md @@ -0,0 +1,394 @@ +# Java API + +## MobileConfig + +```java +public class MobileConfig extends ConfigBase; +``` + +`MobileConfig`用来配置构建轻量级PaddlePredictor的配置信息,如NaiveBuffer格式的模型地址、能耗模式、工作线程数等等。 + +*注意:输入的模型需要使用Model Optimize Tool转化为NaiveBuffer格式的优化模型。* + +示例: + +```java +MobileConfig config = new MobileConfig(); +// 设置NaiveBuffer格式模型目录 +config.setModelFromFile(modelfile); +// 设置能耗模式 +config.setPowerMode(PowerMode.LITE_POWER_HIGH); +// 设置工作线程数 +config.setThreads(1); + +// 根据MobileConfig创建PaddlePredictor +PaddlePredictor predictor = PaddlePredictor.createPaddlePredictor(config); +``` + +### ``setModelFromFile(model_file)`` + +设置模型文件夹路径。 + +参数: + +- `model_file(String)` - 模型文件路径 + +返回:`None` + +返回类型:`void` + + + +### ``setModelDir(model_dir)`` + +**注意**:Lite模型格式在release/v2.3.0之后修改,本接口为加载老格式模型的接口,将在release/v3.0.0废弃。建议替换为`setModelFromFile`接口。 + +设置模型文件夹路径。 + +参数: + +- `model_dir(String)` - 模型文件夹路径 + 
+返回:`None` + +返回类型:`void` + + + +### ``setModelFromBuffer(model_buffer)`` + +设置模型的内存数据,当需要从内存加载模型时使用。 + +参数: + +- `model_buffer(str)` - 内存中的模型数据 + +返回:`None` + +返回类型:`void` + + + +### `getModelDir()` + +返回设置的模型文件夹路径。 + +参数: + +- `None` + +返回:模型文件夹路径 + +返回类型:`String` + + + +### `setPowerMode(mode)` + +设置CPU能耗模式。若不设置,则默认使用`LITE_POWER_HIGH`。 + +*注意:只在开启`OpenMP`时生效,否则系统自动调度。* + +参数: + +- `mode(PowerMode)` - CPU能耗模式。 + +返回:`None` + +返回类型:`void` + + + +### `getPowerMode()` + +获取设置的CPU能耗模式。 + +参数: + +- `None` + +返回:设置的CPU能耗模式 + +返回类型:`PowerMode` + + + +### `setThreads(threads)` + +设置工作线程数。若不设置,则默认使用单线程。 + +*注意:只在开启`OpenMP`的模式下生效,否则只使用单线程。* + +参数: + +- `threads(int)` - 工作线程数。默认为1。 + +返回:`None` + +返回类型:`void` + + + +### `getThreads()` + +获取设置的工作线程数。 + +参数: + +- `None` + +返回:工作线程数 + +返回类型:`int` + +## PaddlePredictor + +```java +public class PaddlePredictor; +``` + +`PaddlePredictor`是Paddle-Lite的预测器。用户可以根据PaddlePredictor提供的接口使用MobileConfig创建新的预测器、设置输入数据、执行模型预测、获取输出以及获得当前使用lib的版本信息等。 + +示例: + +```java +// 设置MobileConfig +MobileConfig config = new MobileConfig(); +config.setModelDir(modelPath); + +// 创建PaddlePredictor +PaddlePredictor predictor = PaddlePredictor.createPaddlePredictor(config); + +// 设置输入数据 +long[] dims = {100, 100}; +float[] inputBuffer = new float[10000]; +for (int i = 0; i < 10000; ++i) { + inputBuffer[i] = i; +} +Tensor input = predictor.getInput(0); +input.resize(dims); +input.setData(inputBuffer); + +// 执行预测 +predictor.run(); + +// 获取输出数据 +Tensor output = predictor.getOutput(0); +float[] output = result.getFloatData(); +for (int i = 0; i < 1000; ++i) { + System.out.println(output[i]); +} +``` + + + +### `CreatePaddlePredictor(config)` + +```java +public static PaddlePredictor createPaddlePredictor(ConfigBase config); +``` + +`CreatePaddlePredictor`用来根据`ConfigBase`动态创建预测器,目前Java API支持使用MobileConfig`。框架会根据您在config中指定的模型路径、能耗模型、工作线程数等自动创建一个预测器。 + +参数: + +- `config(ConfigBase,目前应使用MobileConfig)` - 创建预测器的配置信息 + +返回:根据config创建完成的预测器 + +返回类型:`PaddlePredictor` + + 
+ +### `getInput(index)` + +获取输入Tensor,用来设置模型的输入数据。 + +参数: + +- `index(int)` - 输入Tensor的索引 + +返回:第`index`个输入`Tensor` + +返回类型:`Tensor` + + + +### `getOutput(index)` + +获取输出Tensor,用来获取模型的输出结果。 + +参数: + +- `index(int)` - 输出Tensor的索引 + +返回:第`index`个输出Tensor + +返回类型:`Tensor` + + + +### `run()` + +执行模型预测,需要在***设置输入数据后***调用。 + +参数: + +- `None` + +返回:预测执行状态,成功返回`true`,否则返回`false` + +返回类型:`boolean` + + + +### `getVersion()` + +用于获取当前lib使用的代码版本。若代码有相应tag则返回tag信息,如`v2.0-beta`;否则返回代码的`branch(commitid)`,如`develop(7e44619)`。 + +参数: + +- `None` + +返回:当前lib使用的代码版本信息 + +返回类型:`String` + +## PowerMode + +```java +public enum PowerMode; +``` + +`PowerMode`为ARM CPU能耗模式,用户可以根据应用场景设置能耗模式获得最优的能效比。 + +示例: + +```java +MobileConfig config = new MobileConfig(); +// 设置NaiveBuffer格式模型目录 +config.setModelDir(modelPath); +// 设置能耗模式 +config.setPowerMode(PowerMode.LITE_POWER_HIGH); + +// 根据MobileConfig创建PaddlePredictor +PaddlePredictor predictor = PaddlePredictor.createPaddlePredictor(config); +``` + +PowerMode详细说明如下: + +| 选项 | 说明 | +| :------------------: | ------------------------------------------------------------ | +| LITE_POWER_HIGH | 绑定大核运行模式。如果ARM CPU支持big.LITTLE,则优先使用并绑定Big cluster。如果设置的线程数大于大核数量,则会将线程数自动缩放到大核数量。如果系统不存在大核或者在一些手机的低电量情况下会出现绑核失败,如果失败则进入不绑核模式。 | +| LITE_POWER_LOW | 绑定小核运行模式。如果ARM CPU支持big.LITTLE,则优先使用并绑定Little cluster。如果设置的线程数大于小核数量,则会将线程数自动缩放到小核数量。如果找不到小核,则自动进入不绑核模式。 | +| LITE_POWER_FULL | 大小核混用模式。线程数可以大于大核数量。当线程数大于核心数量时,则会自动将线程数缩放到核心数量。 | +| LITE_POWER_NO_BIND | 不绑核运行模式(推荐)。系统根据负载自动调度任务到空闲的CPU核心上。 | +| LITE_POWER_RAND_HIGH | 轮流绑定大核模式。如果Big cluster有多个核心,则每预测10次后切换绑定到下一个核心。 | +| LITE_POWER_RAND_LOW | 轮流绑定小核模式。如果Little cluster有多个核心,则每预测10次后切换绑定到下一个核心。 | + + + +## Tensor + +```c++ +public class Tensor; +``` + +Tensor是Paddle-Lite的数据组织形式,用于对底层数据进行封装并提供接口对数据进行操作,包括设置维度、数据等。 + +*注意:用户应使用`PaddlePredictor`的`getInput`和`getOuput`接口获取输入/输出的`Tensor`。* + +示例: + +```java +// 导入Java API +import com.baidu.paddle.lite.MobileConfig; +import com.baidu.paddle.lite.Tensor; +import 
com.baidu.paddle.lite.Predictor; +import com.baidu.paddle.lite.PowerMode; + +// 设置MobileConfig +MobileConfig config = new MobileConfig(); +config.setModelDir(modelPath); + +// 创建PaddlePredictor +PaddlePredictor predictor = PaddlePredictor.createPaddlePredictor(config); + +// 设置输入数据 +long[] dims = {100, 100}; +float[] inputBuffer = new float[10000]; +for (int i = 0; i < 10000; ++i) { + inputBuffer[i] = i; +} +// 获取输入Tensor +Tensor input = predictor.getInput(0); +// 设置输入维度 +input.resize(dims); +// 设置输入数据 +input.setData(inputBuffer); + +// 执行预测 +predictor.run(); + +// 获取输出Tensor +Tensor result = predictor.getOutput(0); +// 获取输出数据 +float[] output = result.getFloatData(); +for (int i = 0; i < 1000; ++i) { + System.out.println(output[i]); +} +``` + +### `resize(dims)` + +设置Tensor的维度信息。 + +参数: + +- `dims(long[])` - 维度信息 + +返回:设置成功返回`true`,否则返回`false` + +返回类型:`boolean` + + + +### `shape()` + +获取Tensor的维度信息。 + +参数: + +- `None` + +返回:Tensor的维度信息 + +返回类型:`long[]` + + + +### `setData(data)` + +设置Tensor数据。 + +参数: + +- `data(float[])` - 需要设置的数据 + +返回:成功则返回`true`,否则返回`false` + +返回类型:`boolean` + + + +### `getFloatData()` + +获取Tensor的底层float型数据。 + +参数: + +- `None` + +返回:`Tensor`底层数据 + +返回类型:`float[]` diff --git a/docs/api_reference/python_api_doc.md b/docs/api_reference/python_api_doc.md new file mode 100755 index 0000000000000000000000000000000000000000..b4c9e1715ccae9d194aa29fea30f41b3496ec0ae --- /dev/null +++ b/docs/api_reference/python_api_doc.md @@ -0,0 +1,800 @@ +# Python API + +## create_paddle_predictor + +```python +CxxPredictor create_paddle_predictor(config); # config为CxxConfig类型 +LightPredictor create_paddle_predictor(config); # config为MobileConfig类型 +``` + +`create_paddle_predictor`函数用来根据`CxxConfig`或`MobileConfig`构建预测器。 + +示例: + +```python +from lite_core import * + +# 设置CxxConfig +config = CxxConfig() +config.set_model_dir() +places = [Place(TargetType.ARM, PrecisionType.FP32)] +config.set_valid_places(places) + +# 根据CxxConfig创建CxxPredictor +predictor = 
create_paddle_predictor(config) +``` + +参数: + +- `config(CxxConfig或MobileConfig)` - 用于构建Predictor的配置信息。 + +返回:预测器`predictor` + +返回类型:`CxxPredictor`或`LightPredictor` + +## CxxConfig + +```python +class CxxConfig; +``` + +`CxxConfig`用来配置构建CxxPredictor的配置信息,如protobuf格式的模型地址、能耗模式、工作线程数、place信息等等。 + +示例: + +```python +from lite_core import * + +config = CxxConfig() +# 设置模型目录,加载非combined模型时使用 +config.set_model_dir() +# 设置工作线程数 +config.set_threads(4); +# 设置能耗模式 +config.set_power_mode(PowerMode.LITE_POWER_NO_BIND) +# 设置valid places +places = [Place(TargetType.ARM, PrecisionType.FP32)] +config.set_valid_places(places) + +# 根据CxxConfig创建CxxPredictor +predictor = create_paddle_predictor(config) +``` + +### `set_model_dir(model_dir)` + +设置模型文件夹路径,当需要从磁盘加载非combined模型时使用。 + +参数: + +- `model_dir(str)` - 模型文件夹路径 + +返回:`None` + +返回类型:`None` + + + +### `model_dir()` + +返回设置的模型文件夹路径。 + +参数: + +- `None` + +返回:模型文件夹路径 + +返回类型:`str` + + + +### `set_model_file(model_file)` + +设置模型文件路径,加载combined形式模型时使用。 + +参数: + +- `model_file(str)` - 模型文件路径 + +返回类型:`None` + + + +### `model_file()` + +获取设置模型文件路径,加载combined形式模型时使用。 + +参数: + +- `None` + +返回:模型文件路径 + +返回类型:`str` + + + +### `set_param_file(param_file)` + +设置模型参数文件路径,加载combined形式模型时使用。 + +参数: + +- `param_file(str)` - 模型文件路径 + +返回类型:`None` + + + +### `param_file()` + +获取设置模型参数文件路径,加载combined形式模型时使用。 + +参数: + +- `None` + +返回:模型参数文件路径 + +返回类型:`str` + + + +### `set_valid_places(valid_places)` + +设置可用的places列表。 + +参数: + +- `valid_places(list)` - 可用place列表。 + +返回类型:`None` + +示例: + +```python +from lite_core import * + +config = CxxConfig() +# 设置模型目录,加载非combined模型时使用 +config.set_model_dir() +# 设置valid places +# 注意,valid_places列表中Place的排序表明了用户对Place的偏好程度,如用户想优先使用ARM上Int8精度的 +# kernel,则应把Place(TargetType.ARM, PrecisionType.INT8)置于valid_places列表的首位。 +places = [Place(TargetType.ARM, PrecisionType.INT8), + Place(TargetType.ARM, PrecisionType.FP32)] +config.set_valid_places(places) + +# 根据CxxConfig创建CxxPredictor +predictor = 
create_paddle_predictor(config) +``` + + + +### `set_power_mode(mode)` + +设置CPU能耗模式。若不设置,则默认使用`PowerMode.LITE_POWER_HIGH`。 + +*注意:只在开启`OpenMP`时生效,否则系统自动调度。此函数只在使用`LITE_WITH_ARM`编译选项下生效。* + +参数: + +- `mode(PowerMode)` - CPU能耗模式 + +返回:`None` + +返回类型:`None` + + + +### `power_mode()` + +获取设置的CPU能耗模式。 + +*注意:此函数只在使用`LITE_WITH_ARM`编译选项下生效。* + +参数: + +- `None` + +返回:设置的CPU能耗模式 + +返回类型:`PowerMode` + + + +### `set_threads(threads)` + +设置工作线程数。若不设置,则默认使用单线程。 + +*注意:只在开启`OpenMP`的模式下生效,否则只使用单线程。此函数只在使用`LITE_WITH_ARM`编译选项下生效。* + +参数: + +- `threads(int)` - 工作线程数 + +返回:`None` + +返回类型:`None` + + + +### `threads()` + +获取设置的工作线程数。 + +*注意:此函数只在使用`LITE_WITH_ARM`编译选项下生效。* + +参数: + +- `None` + +返回:工作线程数 + +返回类型:`int` + +## MobileConfig + +```python +class MobileConfig; +``` + +`MobileConfig`用来配置构建LightPredictor的配置信息,如NaiveBuffer格式的模型地址、能耗模式、工作线程数等等。 + +示例: + +```python +from lite_core import * + +config = MobileConfig() +# 设置NaiveBuffer格式模型目录 +config.set_model_from_file() +# 设置工作线程数 +config.set_threads(4); +# 设置能耗模式 +config.set_power_mode(PowerMode.LITE_POWER_NO_BIND) + +# 根据MobileConfig创建LightPredictor +predictor = create_paddle_predictor(config) +``` + +### `set_model_from_file(model_file)` + +**注意**:`model_file`应该是经过`opt`优化后产生的`NaiveBuffer`格式的模型。 + +设置模型文件夹路径。 + +参数: + +- `model_file(str)` - 模型文件路径 + +返回:`None` + +返回类型:`None` + + + +### `set_model_dir(model_dir)` + +**注意**:Lite模型格式在release/v2.3.0之后修改,本接口为加载老格式模型的接口,将在release/v3.0.0废弃。建议替换为`setModelFromFile`接口。`model_dir`应该是经过`Model Optimize Tool`优化后产生的`NaiveBuffer`格式的模型。 + +设置模型文件夹路径。 + +参数: + +- `model_dir(str)` - 模型文件夹路径 + +返回:`None` + +返回类型:`None` + + + +### `set_model_from_buffer(model_buffer)` + +设置模型的内存数据,当需要从内存加载模型时使用。 + +参数: + +- `model_buffer(str)` - 内存中的模型数据 + +返回:`None` + +返回类型:`void` + + + + +### `model_dir()` + +返回设置的模型文件夹路径。 + +参数: + +- `None` + +返回:模型文件夹路径 + +返回类型:`str` + + + +### `set_power_mode(mode)` + +设置CPU能耗模式。若不设置,则默认使用`PowerMode.LITE_POWER_HIGH`。 + +*注意:只在开启`OpenMP`时生效,否则系统自动调度。此函数只在使用`LITE_WITH_ARM`编译选项下生效。* + 
+参数: + +- `mode(PowerMode)` - CPU能耗模式 + +返回:`None` + +返回类型:`None` + + + +### `power_mode()` + +获取设置的CPU能耗模式。 + +*注意:此函数只在使用`LITE_WITH_ARM`编译选项下生效。* + +参数: + +- `None` + +返回:设置的CPU能耗模式 + +返回类型:`PowerMode` + + + +### `set_threads(threads)` + +设置工作线程数。若不设置,则默认使用单线程。 + +*注意:只在开启`OpenMP`的模式下生效,否则只使用单线程。此函数只在使用`LITE_WITH_ARM`编译选项下生效。* + +参数: + +- `threads(int)` - 工作线程数 + +返回:`None` + +返回类型:`None` + + + +### `threads()` + +获取设置的工作线程数。 + +*注意:此函数只在使用`LITE_WITH_ARM`编译选项下生效。* + +参数: + +- `None` + +返回:工作线程数 + +返回类型:`int` + +## CxxPredictor + +```c++ +class CxxPredictor +``` + +`CxxPredictor`是Paddle-Lite的预测器,由`create_paddle_predictor`根据`CxxConfig`进行创建。用户可以根据CxxPredictor提供的接口设置输入数据、执行模型预测、获取输出以及获得当前使用lib的版本信息等。 + +示例: + +```python +from __future__ import print_function +from lite_core import * + +# 1. 设置CxxConfig +config = CxxConfig() +if args.model_file != '' and args.param_file != '': + config.set_model_file(args.model_file) + config.set_param_file(args.param_file) +else: + config.set_model_dir(args.model_dir) +places = [Place(TargetType.ARM, PrecisionType.FP32)] +config.set_valid_places(places) + +# 2. 创建CxxPredictor +predictor = create_paddle_predictor(config) + +# 3. 设置输入数据 +input_tensor = predictor.get_input(0) +input_tensor.resize([1, 3, 224, 224]) +input_tensor.set_float_data([1.] * 3 * 224 * 224) + +# 4. 运行模型 +predictor.run() + +# 5. 
获取输出数据 +output_tensor = predictor.get_output(0) +print(output_tensor.shape()) +print(output_tensor.float_data()[:10]) +``` + +### `get_input(index)` + +获取输入Tensor,用来设置模型的输入数据。 + +参数: + +- `index(int)` - 输入Tensor的索引 + +返回:第`index`个输入`Tensor` + +返回类型:`Tensor` + + + +### `get_output(index)` + +获取输出Tensor,用来获取模型的输出结果。 + +参数: + +- `index(int)` - 输出Tensor的索引 + +返回:第`index`个输出`Tensor` + +返回类型:`Tensor` + + + +### `run()` + +执行模型预测,需要在***设置输入数据后***调用。 + +参数: + +- `None` + +返回:`None` + +返回类型:`None` + + + +### `get_version()` + +用于获取当前lib使用的代码版本。若代码有相应tag则返回tag信息,如`v2.0-beta`;否则返回代码的`branch(commitid)`,如`develop(7e44619)`。 + +参数: + +- `None` + +返回:当前lib使用的代码版本信息 + +返回类型:`str` + +## LightPredictor + +```c++ +class LightPredictor +``` + +`LightPredictor`是Paddle-Lite的预测器,由`create_paddle_predictor`根据`MobileConfig`进行创建。用户可以根据LightPredictor提供的接口设置输入数据、执行模型预测、获取输出以及获得当前使用lib的版本信息等。 + +示例: + +```python +from __future__ import print_function +from lite_core import * + +# 1. 设置MobileConfig +config = MobileConfig() +config.set_model_dir(args.model_dir) + +# 2. 创建LightPredictor +predictor = create_paddle_predictor(config) + +# 3. 设置输入数据 +input_tensor = predictor.get_input(0) +input_tensor.resize([1, 3, 224, 224]) +input_tensor.set_float_data([1.] * 3 * 224 * 224) + +# 4. 运行模型 +predictor.run() + +# 5. 
获取输出数据 +output_tensor = predictor.get_output(0) +print(output_tensor.shape()) +print(output_tensor.float_data()[:10]) +``` + +### `get_input(index)` + +获取输入Tensor,用来设置模型的输入数据。 + +参数: + +- `index(int)` - 输入Tensor的索引 + +返回:第`index`个输入`Tensor` + +返回类型:`Tensor` + + + +### `get_output(index)` + +获取输出Tensor,用来获取模型的输出结果。 + +参数: + +- `index(int)` - 输出Tensor的索引 + +返回:第`index`个输出`Tensor` + +返回类型:`Tensor` + + + +### `run()` + +执行模型预测,需要在***设置输入数据后***调用。 + +参数: + +- `None` + +返回:`None` + +返回类型:`None` + + + +### `get_version()` + +用于获取当前lib使用的代码版本。若代码有相应tag则返回tag信息,如`v2.0-beta`;否则返回代码的`branch(commitid)`,如`develop(7e44619)`。 + +参数: + +- `None` + +返回:当前lib使用的代码版本信息 + +返回类型:`str` + +## TargetType + +```python +class TargetType; +``` +`TargetType`为目标设备硬件类型,用户可以根据应用场景选择硬件平台类型。 + +枚举型变量`TargetType`的所有可能取值包括: + +`{X86, CUDA, ARM, OpenCL, FPGA, NPU}` + + +## PrecisionType +```python +class PrecisionType {FP32}; +``` +`PrecisionType`为模型中Tensor的数据精度,默认值为FP32(float32)。 + +枚举型变量`PrecisionType`的所有可能取值包括: + +`{FP32, INT8, INT32, INT64}` + + + + +## DataLayoutType + +```python +class DataLayoutType {NCHW}; +``` +`DataLayoutType`为Tensor的数据格式,默认值为NCHW(number, channel, height, weigth)。 + +枚举型变量`DataLayoutType`的所有可能取值包括: + +` {NCHW, NHWC}` + + + +## Place +```python +class Place{ + TargetType target; + PrecisionType precision{FP32}; + DataLayoutType layout{NCHW} +} +``` +`Place`是`TargetType`、`PrecisionType`和`DataLayoutType`的集合,说明运行时的设备类型、数据精度和数据格式。 + +示例: +```python +from lite_core import * + +Place{TargetType(ARM), PrecisionType(FP32), DataLayoutType(NCHW)} +``` + + + +## PowerMode + +```python +class PowerMode; +``` + +`PowerMode`为ARM CPU能耗模式,用户可以根据应用场景设置能耗模式获得最优的能效比。 + +示例: + +```python +from lite_core import * + +config = MobileConfig() +# 设置NaiveBuffer格式模型目录 +config.set_model_dir() +# 设置能耗模式 +config.set_power_mode(PowerMode.LITE_POWER_NO_BIND) + +# 根据MobileConfig创建LightPredictor +predictor = create_paddle_predictor(config) +``` + +PowerMode详细说明如下: + +| 选项 | 说明 | +| :------------------: | 
------------------------------------------------------------ | +| LITE_POWER_HIGH | 绑定大核运行模式。如果ARM CPU支持big.LITTLE,则优先使用并绑定Big cluster。如果设置的线程数大于大核数量,则会将线程数自动缩放到大核数量。如果系统不存在大核或者在一些手机的低电量情况下会出现绑核失败,如果失败则进入不绑核模式。 | +| LITE_POWER_LOW | 绑定小核运行模式。如果ARM CPU支持big.LITTLE,则优先使用并绑定Little cluster。如果设置的线程数大于小核数量,则会将线程数自动缩放到小核数量。如果找不到小核,则自动进入不绑核模式。 | +| LITE_POWER_FULL | 大小核混用模式。线程数可以大于大核数量。当线程数大于核心数量时,则会自动将线程数缩放到核心数量。 | +| LITE_POWER_NO_BIND | 不绑核运行模式(推荐)。系统根据负载自动调度任务到空闲的CPU核心上。 | +| LITE_POWER_RAND_HIGH | 轮流绑定大核模式。如果Big cluster有多个核心,则每预测10次后切换绑定到下一个核心。 | +| LITE_POWER_RAND_LOW | 轮流绑定小核模式。如果Little cluster有多个核心,则每预测10次后切换绑定到下一个核心。 | + + + +## Tensor + +```c++ +class Tensor +``` + +Tensor是Paddle-Lite的数据组织形式,用于对底层数据进行封装并提供接口对数据进行操作,包括设置Shape、数据、LoD信息等。 + +*注意:用户应使用`CxxPredictor`或`LightPredictor`的`get_input`和`get_output`接口获取输入/输出的`Tensor`。* + +示例: + +```python +from __future__ import print_function +from lite_core import * + +# 1. 设置CxxConfig +config = CxxConfig() +if args.model_file != '' and args.param_file != '': + config.set_model_file(args.model_file) + config.set_param_file(args.param_file) +else: + config.set_model_dir(args.model_dir) +places = [Place(TargetType.ARM, PrecisionType.FP32)] +config.set_valid_places(places) + +# 2. 创建CxxPredictor +predictor = create_paddle_predictor(config) + +# 3. 设置输入数据 +input_tensor = predictor.get_input(0) +input_tensor.resize([1, 3, 224, 224]) +input_tensor.set_float_data([1.] * 3 * 224 * 224) + +# 4. 运行模型 +predictor.run() + +# 5. 
获取输出数据 +output_tensor = predictor.get_output(0) +print(output_tensor.shape()) +print(output_tensor.float_data()[:10]) +``` + +### `resize(shape)` + +设置Tensor的维度信息。 + +参数: + +- `shape(list)` - 维度信息 + +返回:`None` + +返回类型:`None` + + + +### `shape()` + +获取Tensor的维度信息。 + +参数: + +- `None` + +返回:Tensor的维度信息 + +返回类型:`list` + + + +### `float_data()` + +获取Tensor的持有的float型数据。 + +示例: + +```python +output_tensor = predictor.get_output(0) +print(output_tensor.shape()) +print(output_tensor.float_data()[:10]) +``` + +参数: + +- `None` + +返回:`Tensor`持有的float型数据 + +返回类型:`list` + + + +### `set_float_data(float_data)` + +设置Tensor持有float数据。 + +示例: + +```python +input_tensor = predictor.get_input(0) +input_tensor.resize([1, 3, 224, 224]) +input_tensor.set_float_data([1.] * 3 * 224 * 224) +``` + +参数: + +- `float_data(list)` - 待设置的float型数据 + +返回:`None` + +返回类型:`None` + + + +### `set_lod(lod)` + +设置Tensor的LoD信息。 + +参数: + +- `lod(list[list])` - Tensor的LoD信息 + +返回:`None` + +返回类型:`None` + + + +### `lod()` + +获取Tensor的LoD信息 + +参数: + +- `None` + +返回:`Tensor`的LoD信息 + +返回类型:`list[list]` diff --git a/docs/benchmark/benchmark.md b/docs/benchmark/benchmark.md index efb0805fddc0bd62a2b21a130018edaa9213e0cf..2868d0e7e573d83a0fa804732c80744e566e78d3 100644 --- a/docs/benchmark/benchmark.md +++ b/docs/benchmark/benchmark.md @@ -1,4 +1,4 @@ -# Benchmark 数据 +# 性能数据 可以参考[benchmark_tools](benchmark_tools),推荐**一键benchmark**。 @@ -15,14 +15,12 @@ * int8模型 * mobilenet_v1 * mobilenet_v2 - * resnet50 * 测试机器(android ndk ndk-r17c) * 骁龙855 * xiaomi mi9, snapdragon 855 * 4xA76(1@2.84GHz + 3@2.4GHz) + 4xA55@1.78GHz - * 骁龙845 * xiaomi mi8, 845 * 2.8GHz(大四核),1.7GHz(小四核) @@ -30,20 +28,12 @@ * 骁龙835 * xiaomi mix2, snapdragon 835 * 2.45GHz(大四核),1.9GHz(小四核) - - * 骁龙625 - * oppo R9s, snapdragon625 - * A53 x 8, big core@2.0GHz - - * 骁龙653 - * 360 N5, snapdragon 653 - * 4 x A73@2.0GHz + 4 x A53@1.4GHz - + * 麒麟970 * HUAWEI Mate10 * 测试说明 - * branch: release/2.0.0 + * branch: release/v2.3.0 * warmup=10, repeats=30,统计平均时间,单位是ms * 
当线程数为1时,```DeviceInfo::Global().SetRunMode```设置LITE_POWER_HIGH,否者设置LITE_POWER_NO_BIND * 模型的输入图像的维度是{1, 3, 224, 224},输入图像的每一位数值是1 @@ -55,78 +45,59 @@ #### paddlepaddle model - 骁龙855|armv7 | armv7 | armv7 |armv8 | armv8 |armv8 ----| ---- | ---- | ---- | ---- |---- |---- threads num|1 |2 |4 |1 |2 |4 -mobilenet_v1 |32.19 |18.81 |10.90 |30.92 |18.31 |10.15 -mobilenet_v2 |22.91 |13.75 |8.64 |21.15 |12.79 |7.84 -shufflenet_v2 |4.67 |3.37 |2.65 |4.43 |3.15 |2.66 -squeezenet_v1.1 |25.10 |15.93 |9.68 |23.28 |14.61 |8.71 -mnasnet |21.84 |13.14 |7.96 |19.61 |11.88 |7.55 +mobilenet_v1 |33.27 |19.52 |11.14 |31.72 |18.76 |10.24 | +mobilenet_v2 |29.08 |15.79 |9.25 |25.89 |14.17 |8.38 | +shufflenet_v2 |4.40 |3.09 |2.30 |4.28 |3.02 |2.35 | +squeezenet_v1.1 |19.96 |12.61 |8.76 |18.25 |11.46 |7.97 | +mnasnet |21.00 |12.54 |7.28 |19.65 |11.65 |6.96 | - -骁龙835|armv7 | armv7 | armv7 |armv8 | armv8 |armv8 +骁龙845|armv7 | armv7 | armv7 |armv8 | armv8 |armv8 ----| ---- | ---- | ---- | ---- |---- |---- threads num|1 |2 |4 |1 |2 |4 -mobilenet_v1 |94.13 |52.17 |30.68 |88.28 |47.58 |26.64 -mobilenet_v2 |61.24 |34.64 |22.36 |56.66 |32.19 |19.63 -shufflenet_v2 |10.87 |6.92 |5.12 |10.41 |6.76 |4.97 -squeezenet_v1.1 |73.61 |42.25 |24.44 |64.87 |38.43 |23.06 -mnasnet |58.22 |33.43 |20.44 |53.43 |30.20 |18.09 - +mobilenet_v1 |66.36 |35.97 |19.45 |62.66 |33.87 |17.85 | +mobilenet_v2 |45.86 |25.53 |14.6 |41.58 |23.24 |13.39 | +shufflenet_v2 |7.58 |4.89 |3.41 |7.44 |4.91 |3.58 | +squeezenet_v1.1 |37.15 |22.74 |13.51 |34.69 |21.27 |12.74 | +mnasnet |40.09 |21.73 |11.91 |38.19 |21.02 |12.11 | -麒麟980|armv7 | armv7 | armv7 |armv8 | armv8 |armv8 -----| ---- | ---- | ---- | ---- |---- |---- -threads num|1 |2 |4 |1 |2 |4 -mobilenet_v1 |55.11 |28.24 |13.27 |34.24 |17.74 |12.41 -mobilenet_v2 |37.03 |19.80 |51.94 |23.64 |12.98 |9.38 -shufflenet_v2 |7.26 |4.94 |15.06 |5.32 |3.33 |2.82 -squeezenet_v1.1 |42.73 |23.66 |57.39 |26.03 |14.53 |13.66 -mnasnet |36.87 |20.15 |46.04 |21.85 |12.06 |8.68 -麒麟970|armv7 | armv7 | 
armv7 |armv8 | armv8 |armv8 +骁龙835|armv7 | armv7 | armv7 |armv8 | armv8 |armv8 ----| ---- | ---- | ---- | ---- |---- |---- threads num|1 |2 |4 |1 |2 |4 -mobilenet_v1 |97.80 |52.64 |34.46 |94.51 |49.36 |28.43 -mobilenet_v2 |66.55 |38.52 |23.19 |62.89 |34.93 |21.53 -shufflenet_v2 |13.78 |8.11 |5.93 |11.95 |7.90 |5.91 -squeezenet_v1.1 |77.64 |43.67 |25.72 |69.91 |40.66 |24.62 -mnasnet |61.86 |34.62 |22.68 |59.61 |32.79 |19.56 +mobilenet_v1 |96.98 |53.92 |32.24 |89.31 |48.02 |27.58 | +mobilenet_v2 |67.72 |37.66 |23.82 |60.10 |34.36 |21.05 | +shufflenet_v2 |10.72 |6.62 |4.63 |10.10 |6.44 |4.63 | +squeezenet_v1.1 |53.89 |33.28 |20.73 |50.83 |32.31 |19.51 | +mnasnet |59.55 |33.53 |20.32 |56.21 |31.58 |19.06 | #### caffe model 骁龙855|armv7 | armv7 | armv7 |armv8 | armv8 |armv8 ----| ---- | ---- | ---- | ---- |---- |---- threads num|1 |2 |4 |1 |2 |4 | -mobilenet_v1 |32.42 |18.68 |10.86 |30.92 |18.35 |10.07 | -mobilenet_v2 |29.53 |17.76 |10.89 |27.19 |16.53 |9.75 | -shufflenet_v2 |4.61 |3.29 |2.61 |4.36 |3.11 |2.51 | - - -骁龙835|armv7 | armv7 | armv7 |armv8 | armv8 |armv8 -----| ---- | ---- | ---- | ---- |---- |---- -threads num|1 |2 |4 |1 |2 |4 | -mobilenet_v1 |92.52 |52.34 |30.37 |88.31 |49.75 |27.29 | -mobilenet_v2 |79.50 |45.67 |28.79 |76.13 |44.01 |26.13 | -shufflenet_v2 |10.94 |7.08 |5.16 |10.64 |6.83 |5.01 | +mobilenet_v1 |33.36 |19.45 |11.26 |31.63 |18.74 |10.31 | +mobilenet_v2 |31.63 |19.21 |11.61 |28.34 |17.14 |10.16 | +shufflenet_v2 |4.46 |3.08 |2.32 |4.26 |2.98 |2.35 | -麒麟980|armv7 | armv7 | armv7 |armv8 | armv8 |armv8 +骁龙845|armv7 | armv7 | armv7 |armv8 | armv8 |armv8 ----| ---- | ---- | ---- | ---- |---- |---- threads num|1 |2 |4 |1 |2 |4 | -mobilenet_v1 |55.36 |28.18 |13.31 |34.42 |17.93 |12.52 | -mobilenet_v2 |49.17 |26.10 |65.49 |30.50 |16.66 |11.72 | -shufflenet_v2 |8.45 |5.00 |15.65 |4.58 |3.14 |2.83 | +mobilenet_v1 |66.32 |35.83 |19.56 |62.52 |33.79 |17.91 | +mobilenet_v2 |58.46 |32.69 |18.56 |53.72 |29.86 |16.80 | +shufflenet_v2 |7.65 |4.82 |3.46 |7.55 
|4.97 |3.62 | -麒麟970|armv7 | armv7 | armv7 |armv8 | armv8 |armv8 +骁龙835|armv7 | armv7 | armv7 |armv8 | armv8 |armv8 ----| ---- | ---- | ---- | ---- |---- |---- threads num|1 |2 |4 |1 |2 |4 | -mobilenet_v1 |97.85 |53.38 |33.85 |94.29 |49.42 |28.29 | -mobilenet_v2 |87.40 |50.25 |31.85 |85.55 |48.11 |28.24 | -shufflenet_v2 |12.16 |8.39 |6.21 |12.21 |8.33 |6.32 | +mobilenet_v1 |95.38 |54.09 |32.03 |95.05 |48.33 |27.54 | +mobilenet_v2 |88.46 |48.98 |30.23 |79.28 |44.64 |27.10 | +shufflenet_v2 |10.07 |6.51 |4.61 |10.31 |6.50 |4.66 | #### int8量化模型测试数据 @@ -136,6 +107,7 @@ threads num|1 |2 |4 |1 |2 |4 | mobilenet_v1 |36.80 |21.58 |11.12 | 14.01 |8.13 |4.32 | mobilenet_v2 |28.72 |19.08 |12.49 | 17.24 |11.55 |7.82 | + 骁龙835|armv7 | armv7 | armv7 |armv8 | armv8 |armv8 ----| ---- | ---- | ---- | ---- |---- |---- threads num|1 |2 |4 |1 |2 |4 | diff --git a/docs/benchmark/benchmark_tools.md b/docs/benchmark/benchmark_tools.md index 60341762b70772bc46196b836050714b9d43228b..3cf1486307ad79a47dfbfe199e3d6d708c99db4b 100644 --- a/docs/benchmark/benchmark_tools.md +++ b/docs/benchmark/benchmark_tools.md @@ -1,4 +1,4 @@ -# Benchmark 测试方法 +# 测试方法 本文将会介绍,在**Ubuntu:16.04交叉编译环境**下,用安卓手机在终端测试Paddle-Lite的性能,并介绍两种Benchmark方法: @@ -57,7 +57,7 @@ wget -c https://paddle-inference-dist.bj.bcebos.com/PaddleLite/benchmark_0/bench #### 方式二:由源码编译benchmark_bin文件 -根据[源码编译](../source_compile)准备编译环境,拉取PaddleLite最新release发布版代码,并在仓库根目录下,执行: +根据[源码编译](../user_guides/source_compile)准备编译环境,拉取PaddleLite最新release发布版代码,并在仓库根目录下,执行: ```shell ########################################### @@ -135,53 +135,53 @@ sh benchmark.sh ./benchmark_bin_v8 ./benchmark_models result_armv8.txt true > 不同手机,不同版本,测试模型的性能数据不同。 ```shell -run benchmark armv7 +run benchmark armv8 -------------------------------------- PaddleLite Benchmark Threads=1 Warmup=10 Repeats=30 --- mnasnet avg = 159.8427 ms --- mobilenet_v1 avg = 235.0072 ms --- mobilenet_v2 avg = 173.0387 ms --- shufflenet_v2 avg = 76.0040 ms --- squeezenet_v11 avg = 164.2957 ms 
+mnasnet min = 19.83500 max = 19.38500 average = 19.65503 +mobilenetv1 min = 32.00600 max = 31.56900 average = 31.81983 +mobilenetv2 min = 22.37900 max = 22.08700 average = 22.28623 +shufflenetv2 min = 10.80400 max = 10.62900 average = 10.68890 +squeezenet min = 17.67400 max = 17.47900 average = 17.57677 Threads=2 Warmup=10 Repeats=30 --- mnasnet avg = 83.1287 ms --- mobilenet_v1 avg = 121.6029 ms --- mobilenet_v2 avg = 86.6175 ms --- shufflenet_v2 avg = 41.5761 ms --- squeezenet_v11 avg = 87.8678 ms +mnasnet min = 11.85600 max = 11.72000 average = 11.77127 +mobilenetv1 min = 18.75000 max = 18.64300 average = 18.70593 +mobilenetv2 min = 14.05100 max = 13.59900 average = 13.71450 +shufflenetv2 min = 6.67200 max = 6.58300 average = 6.63400 +squeezenet min = 12.07100 max = 11.33400 average = 11.41253 Threads=4 Warmup=10 Repeats=30 --- mnasnet avg = 73.3880 ms --- mobilenet_v1 avg = 119.0739 ms --- mobilenet_v2 avg = 85.3050 ms --- shufflenet_v2 avg = 38.0762 ms --- squeezenet_v11 avg = 64.2201 ms +mnasnet min = 7.19300 max = 7.02600 average = 7.08480 +mobilenetv1 min = 10.42000 max = 10.29100 average = 10.34267 +mobilenetv2 min = 8.61900 max = 8.46900 average = 8.54707 +shufflenetv2 min = 4.55200 max = 4.41900 average = 4.46477 +squeezenet min = 8.60000 max = 7.85200 average = 7.98407 -------------------------------------- -run benchmark armv8 +run benchmark armv7 -------------------------------------- PaddleLite Benchmark Threads=1 Warmup=10 Repeats=30 --- mnasnet avg = 165.3073 ms --- mobilenet_v1 avg = 306.0188 ms --- mobilenet_v2 avg = 195.1884 ms --- shufflenet_v2 avg = 99.3692 ms --- squeezenet_v11 avg = 156.6971 ms +mnasnet min = 20.98300 max = 20.81400 average = 20.92527 +mobilenetv1 min = 33.19000 max = 32.81700 average = 33.08490 +mobilenetv2 min = 25.91400 max = 25.61700 average = 25.73097 +shufflenetv2 min = 11.14300 max = 10.97600 average = 11.06757 +squeezenet min = 19.31800 max = 19.20000 average = 19.26530 Threads=2 Warmup=10 Repeats=30 --- mnasnet avg 
= 90.2290 ms --- mobilenet_v1 avg = 157.0007 ms --- mobilenet_v2 avg = 118.1607 ms --- shufflenet_v2 avg = 68.6804 ms --- squeezenet_v11 avg = 91.3090 ms +mnasnet min = 12.59900 max = 12.46600 average = 12.52207 +mobilenetv1 min = 19.05800 max = 18.94700 average = 18.97897 +mobilenetv2 min = 15.28400 max = 15.11300 average = 15.19843 +shufflenetv2 min = 6.97000 max = 6.81400 average = 6.90863 +squeezenet min = 12.87900 max = 12.12900 average = 12.22530 Threads=4 Warmup=10 Repeats=30 --- mnasnet avg = 179.9730 ms --- mobilenet_v1 avg = 204.0684 ms --- mobilenet_v2 avg = 181.6486 ms --- shufflenet_v2 avg = 123.2728 ms --- squeezenet_v11 avg = 412.9046 ms +mnasnet min = 7.31400 max = 7.12900 average = 7.20357 +mobilenetv1 min = 11.44000 max = 10.86900 average = 10.94383 +mobilenetv2 min = 9.14900 max = 9.03800 average = 9.09907 +shufflenetv2 min = 4.60600 max = 4.49400 average = 4.53360 +squeezenet min = 8.27000 max = 8.10600 average = 8.19000 -------------------------------------- ``` diff --git a/docs/benchmark/index.rst b/docs/benchmark/index.rst deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/docs/demo_guides/android_app_demo.md b/docs/demo_guides/android_app_demo.md new file mode 100644 index 0000000000000000000000000000000000000000..7c40e1eb52bec0112b98fac7b1c49ef79273089f --- /dev/null +++ b/docs/demo_guides/android_app_demo.md @@ -0,0 +1,133 @@ +# Android Demo + +## 多种应用场景 + 
+我们提供的Paddle-Lite示例工程[Paddle-Lite-Demo](https://github.com/PaddlePaddle/Paddle-Lite-Demo),其中包含[Android](https://github.com/PaddlePaddle/Paddle-Lite-Demo/tree/master/PaddleLite-android-demo)、[iOS](https://github.com/PaddlePaddle/Paddle-Lite-Demo/tree/master/PaddleLite-ios-demo)和[Armlinux](https://github.com/PaddlePaddle/Paddle-Lite-Demo/tree/master/PaddleLite-armlinux-demo)平台的示例工程。涵盖[人脸识别](https://github.com/PaddlePaddle/Paddle-Lite-Demo/tree/master/PaddleLite-android-demo/face_detection_demo)、[人像分割](https://github.com/PaddlePaddle/Paddle-Lite-Demo/tree/master/PaddleLite-android-demo/human_segmentation_demo)、[图像分类](https://github.com/PaddlePaddle/Paddle-Lite-Demo/tree/master/PaddleLite-android-demo/image_classification_demo)、[目标检测](https://github.com/PaddlePaddle/Paddle-Lite-Demo/tree/master/PaddleLite-android-demo/object_detection_demo)4个应用场景。 + +### 1. 人脸识别 + +人脸检测是Paddle-Lite提供的人像检测demo。在移动端上提供了高精度、实时的人脸检测能力,能处理基于人脸检测的业务场景。在移动端预测的效果图如下: + +

     

+ +### 2. 人像分割 + +人像分割是Paddle-Lite 提供的图像分割demo ,在移动端上提供了实时的人像分割能力,可以应用证件照自动抠图、面积测量、智能交通(标记车道和交通标志)等场景。 在移动端预测的效果图如下: + +

     

+ +### 3. 图像分类 + +图像分类是Paddle-Lite 提供的图像处理demo ,在移动端上提供了实时的物体识别能力,可以应用到生产线自动分拣或质检、识别医疗图像、辅助医生肉眼诊断等场景。在移动端预测的效果图如下: + +

     

+ +### 4. 物体检测 + +物体检测是Paddle-Lite 提供的图像识别demo ,在移动端上提供了检测多个物体的位置、名称、位置及数量的能力。可以应用到视频监控(是否有违规物体或行为)、工业质检(微小瑕疵的数量和位置)、医疗诊断(细胞计数、中药识别)等场景。在移动端预测的效果图如下: + +

     

+ +## Android demo部署方法 + +下面我们以 **目标检测示例(object_detection_demo)** 为例讲解如何部署。 + +**目的**:将基于Paddle-Lite预测库的Android APP 部署到手机,实现物体检测 + +**需要的环境**: Android Studio、Android手机(开启USB调试模式)、下载到本地的[Paddle-Lite-Demo](https://github.com/PaddlePaddle/Paddle-Lite-Demo)工程 + +**部署步骤**: + +1、 目标检测的Android示例位于 `Paddle-Lite-Demo\PaddleLite-android-demo\object_detection_demo` + +2、用Android Studio 打开object_detection_demo工程 (本步骤需要联网)。 + +3、手机连接电脑,打开**USB调试**和**文件传输模式**,在Android Studio上连接自己的手机设备(手机需要开启允许从 USB安装软件权限) + +![Android_studio](https://paddlelite-data.bj.bcebos.com/doc_images/Android_iOS_demo/android/Android_studio.png) + +4、按下 Run按钮,自动编译APP并安装到手机。(该过程会自动下载Paddle-Lite预测库和模型,需要联网) + +成功后效果如下,图一:APP安装到手机 图二: APP打开后的效果,会自动识别图片中的物体并标记 + +

     

+ +## Android demo结构讲解 + +Android 示例的代码结构如下图所示: + +

+ + + 1、 Predictor.java: 预测代码 + +```shell +# 位置: +object_detection_demo/app/src/main/java/com/baidu/paddle/lite/demo/object_detection/Predictor.java +``` + + 2、 model.nb : 模型文件 (opt 工具转化后Paddle-Lite模型);pascalvoc_label_list:训练模型时的`labels`文件 + +```shell +# 位置: +object_detection_demo/app/src/main/assets/models/ssd_mobilenet_v1_pascalvoc_for_cpu/model.nb +object_detection_demo/app/src/main/assets/labels/pascalvoc_label_list +``` + + 3、 libpaddle_lite_jni.so、PaddlePredictor.jar:Paddle-Lite Java 预测库与Jar包 + +```shell +# 位置 +object_detection_demo/app/src/main/jniLibs/arm64-v8a/libpaddle_lite_jni.so +object_detection_demo/app/libs/PaddlePredictor.jar +``` + + 4、 build.gradle : 定义编译过程的 gradle 脚本。(不用改动,定义了自动下载Paddle-Lite预测和模型的过程) + +```shell +# 位置 +object_detection_demo/app/build.gradle +``` + + + +## 代码讲解 (使用Paddle-Lite Java API 执行预测) + +Android 示例基于Java API 开发,调用Paddle-Lite Java API包括以下五步。更详细的API 描述参考: [Paddle-Lite Java API](https://paddle-lite.readthedocs.io/zh/latest/api_reference/java_api_doc.html)。 + +```c++ +// 导入Java API +import com.baidu.paddle.lite.MobileConfig; +import com.baidu.paddle.lite.Tensor; +import com.baidu.paddle.lite.Predictor; +import com.baidu.paddle.lite.PowerMode; + +// 1. 写入配置:设置MobileConfig +MobileConfig config = new MobileConfig(); +config.setModelFromFile(); // 设置Paddle-Lite模型路径 +config.setPowerMode(PowerMode.LITE_POWER_NO_BIND); // 设置CPU运行模式 +config.setThreads(4); // 设置工作线程数 + +// 2. 创建 PaddlePredictor +PaddlePredictor predictor = PaddlePredictor.createPaddlePredictor(config); + +// 3. 设置输入数据 +long[] dims = {100, 100}; +float[] inputBuffer = new float[10000]; +for (int i = 0; i < 10000; ++i) { + inputBuffer[i] = i; +} +Tensor input = predictor.getInput(0); +input.resize(dims); +input.setData(inputBuffer); + +// 4. 执行预测 +predictor.run(); + +// 5. 
获取输出数据 +Tensor result = predictor.getOutput(0); +float[] output = result.getFloatData(); +for (int i = 0; i < 1000; ++i) { + System.out.println(output[i]); +} +``` diff --git a/docs/demo_guides/cpp_demo.md b/docs/demo_guides/cpp_demo.md new file mode 100644 index 0000000000000000000000000000000000000000..55abd3a70fe23dd0e8798d6a772ee216140c2875 --- /dev/null +++ b/docs/demo_guides/cpp_demo.md @@ -0,0 +1,266 @@ +# C++ Demo + +## 1. 下载最新版本预测库 + +预测库下载界面位于[Paddle-Lite官方预编译库](../user_guides/release_lib),可根据需求选择合适版本。 + +以**Android-ARMv8架构**为例,可以下载以下版本: + + +|ARM Version|build_extra|arm_stl|target|下载| +|:-------:|:-----:|:-----:|:-----:|:-------:| +|armv8|OFF|c++_static|tiny_publish|[release/v2.3](https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.3.0/inference_lite_lib.android.armv8.gcc.c++_static.tiny_publish.tar.gz)| + +**解压后内容如下图所示:** + +![image](https://paddlelite-data.bj.bcebos.com/doc_images/cxx_demo/1inference_lib.png) + +## 2. 转化模型 + +PaddlePaddle的原生模型需要经过[opt]()工具转化为Paddle-Lite可以支持的naive_buffer格式。 + +以`mobilenet_v1`模型为例: + +(1)下载[mobilenet_v1模型](http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz)后解压: + +```shell +wget http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz +tar zxf mobilenet_v1.tar.gz +``` + +**如下图所示:** + +![image](https://paddlelite-data.bj.bcebos.com/doc_images/cxx_demo/3inference_model.png) + +(2)下载[opt工具](https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.3.0/opt)。放入同一文件夹,终端输入命令转化模型: + +```shell +wget https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.3.0/opt +chmod +x opt +./opt --model_dir=./mobilenet_v1 --optimize_out_type=naive_buffer --optimize_out=./mobilenet_v1_opt +``` + +**结果如下图所示:** + +![image](https://paddlelite-data.bj.bcebos.com/doc_images/cxx_demo/2opt_model.png) + + + +## 3. 
编写预测程序 + +准备好预测库和模型,我们便可以编写程序来执行预测。我们提供涵盖图像分类、目标检测等多种应用场景的C++示例demo可供参考,位于`inference_lite_lib.android.armv8/demo/cxx`。 + +以mobilenet_v1预测为例:`mobile_light`为mobilenet_v1预测示例,可以直接调用。 + +**示例如下图所示:** + +![image](https://paddlelite-data.bj.bcebos.com/doc_images/cxx_demo/4light_demo.png) + + + +## 4. 编译 + +预测程序需要编译为Android可执行文件。 + +以mobilenet_v1模型为例,C++示例位于`inference_lite_lib.android.armv8/demo/cxx/mobile_light` + +```shell +cd inference_lite_lib.android.armv8/demo/cxx/mobile_light +``` + +编译demo + +```shell +make +``` + +**结果如下图所示:** + +![image](https://paddlelite-data.bj.bcebos.com/doc_images/cxx_demo/5compile_demo.png) + +## 5. 执行预测 + +通过adb工具将可执行文件推送到手机上执行预测 + +(1)保证电脑已经安装adb工具,手机以"USB调试"、"文件传输模式"连接到电脑。 + +``` shell +adb devices #查看adb设备是否已被识别 +``` + +**连接如下图所示:** + +![image](https://paddlelite-data.bj.bcebos.com/doc_images/cxx_demo/6adb_devices.png) + +(2)准备预测库、模型和预测文件 + +1、将模型、动态库和预测文件放入同一文件夹: + +![image](https://paddlelite-data.bj.bcebos.com/doc_images/cxx_demo/7files.png) + +**注意**:动态预测库文件位于: `inference_lite_lib.android.armv8/cxx/lib/libpaddle_light_api_shared.so` + +2、文件推送到手机: + +``` shell +chmod +x mobilenetv1_light_api +adb push mobilenet_v1_opt.nb /data/local/tmp +adb push libpaddle_light_api_shared.so /data/local/tmp +adb push mobilenetv1_light_api /data/local/tmp +``` +**效果如下图所示:** + +![image](https://paddlelite-data.bj.bcebos.com/doc_images/cxx_demo/8push_file.png) + +(3)执行预测 + +```shell +adb shell 'cd /data/local/tmp && export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp && mobilenetv1_light_api ./mobilenet_v1_opt.nb' +``` +**结果如下图所示:** + +![image](https://paddlelite-data.bj.bcebos.com/doc_images/cxx_demo/9result.png) + +上图的`Output`为mobilenet_v1模型在全1输入时,得到的预测输出。至此,Paddle-Lite的C++ demo执行完毕。 + + + + + +## 注:如何在代码中使用 API + +C++代码调用Paddle-Lite执行预测库仅需以下五步: + +(1)引用头文件和命名空间 + +```c++ +#include "paddle_api.h" +using namespace paddle::lite_api; +``` + +(2)指定模型文件,创建Predictor + +```C++ +// 1. Set MobileConfig, model_file_path is +// the path to the model file. 
+MobileConfig config; +config.set_model_from_file(model_file_path); +// 2. Create PaddlePredictor by MobileConfig +std::shared_ptr<PaddlePredictor> predictor = + CreatePaddlePredictor<MobileConfig>(config); +``` + +(3)设置模型输入 (下面以全一输入为例) + +```c++ +std::unique_ptr<Tensor> input_tensor(std::move(predictor->GetInput(0))); +input_tensor->Resize({1, 3, 224, 224}); +auto* data = input_tensor->mutable_data<float>(); +for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) { + data[i] = 1; +} +``` + +(4)执行预测 + +```c++ +predictor->Run(); +``` + +(5)获得预测结果 + +```c++ +std::unique_ptr<Tensor> output_tensor( + std::move(predictor->GetOutput(0))); +// 转化为数据 +auto output_data=output_tensor->data<float>(); +``` + + + + + +## 其他cxx_demo的编译与预期结果 + +### Light API Demo + +```shell +cd ../mobile_light +make +adb push mobilenetv1_light_api /data/local/tmp/ +adb shell chmod +x /data/local/tmp/mobilenetv1_light_api +adb shell "/data/local/tmp/mobilenetv1_light_api --model_dir=/data/local/tmp/mobilenet_v1.opt " +``` + + +### 图像分类 Demo + +```shell +cd ../mobile_classify +wget http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz +tar zxvf mobilenet_v1.tar.gz +make +adb push mobile_classify /data/local/tmp/ +adb push test.jpg /data/local/tmp/ +adb push labels.txt /data/local/tmp/ +adb push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/ +adb shell chmod +x /data/local/tmp/mobile_classify +adb shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && /data/local/tmp/mobile_classify /data/local/tmp/mobilenet_v1.opt /data/local/tmp/test.jpg /data/local/tmp/labels.txt" +``` + +### 目标检测 Demo + +```shell +cd ../mobile_detection +wget https://paddle-inference-dist.bj.bcebos.com/mobilenetv1-ssd.tar.gz +tar zxvf mobilenetv1-ssd.tar.gz +make +adb push mobile_detection /data/local/tmp/ +adb push test.jpg /data/local/tmp/ +adb push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/ +adb shell chmod +x /data/local/tmp/mobile_detection +adb shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && 
/data/local/tmp/mobile_detection /data/local/tmp/mobilenetv1-ssd /data/local/tmp/test.jpg" +adb pull /data/local/tmp/test_detection_result.jpg ./ +``` + +### light API Demo 运行结果 + +运行成功后 ,将在控制台输出预测结果的前10个类别的预测概率: + +```shell +Output dim: 1000 +Output[0]: 0.000191 +Output[100]: 0.000160 +Output[200]: 0.000264 +Output[300]: 0.000211 +Output[400]: 0.001032 +Output[500]: 0.000110 +Output[600]: 0.004829 +Output[700]: 0.001845 +Output[800]: 0.000202 +Output[900]: 0.000586 +``` + +### 图像分类 Demo 运行结果 + +运行成功后 ,将在控制台输出预测结果的前5个类别的类型索引、名字和预测概率: + +```shell +parameter: model_dir, image_path and label_file are necessary +parameter: topk, input_width, input_height, are optional +i: 0, index: 285, name: Egyptian cat, score: 0.482870 +i: 1, index: 281, name: tabby, tabby cat, score: 0.471593 +i: 2, index: 282, name: tiger cat, score: 0.039779 +i: 3, index: 287, name: lynx, catamount, score: 0.002430 +i: 4, index: 722, name: ping-pong ball, score: 0.000508 +``` + +### 目标检测 Demo 运行结果 + +运行成功后 ,将在控制台输出检测目标的类型、预测概率和坐标: + +```shell +running result: +detection image size: 935, 1241, detect object: person, score: 0.996098, location: x=187, y=43, width=540, height=592 +detection image size: 935, 1241, detect object: person, score: 0.935293, location: x=123, y=639, width=579, height=597 +``` diff --git a/docs/user_guides/cuda.md b/docs/demo_guides/cuda.md similarity index 73% rename from docs/user_guides/cuda.md rename to docs/demo_guides/cuda.md index 45597057bb18c44b60234459f9a49a59b54135f6..8b3e76acef590bda19a59388017added6a0b8d52 100644 --- a/docs/user_guides/cuda.md +++ b/docs/demo_guides/cuda.md @@ -1,4 +1,4 @@ -# Lite基于CUDA的模型预测 +# PaddleLite使用CUDA预测部署 Lite支持在x86_64,arm64架构上(如:TX2)进行CUDA的编译运行。 @@ -28,7 +28,27 @@ cd Paddle-Lite ./lite/tools/build.sh --build_python=ON cuda ``` -编译结束会在 `build_cuda/inference_lite_lib/python/lib/` 目录下生成 `lite_core.so`。 +## 编译结果说明 + +cuda的编译结果位于 `build_cuda/inference_lite_lib` +**具体内容**说明: + +1、 `bin`文件夹:可执行工具文件,目前为空 + +2、 `cxx`文件夹:包含c++的库文件与相应的头文件 + +- 
`include` : 头文件 +- `lib` : 库文件 + - 打包的静态库文件: + - `libpaddle_api_full_bundled.a` :包含 full_api 和 light_api 功能的静态库 + - 打包的动态态库文件: + - `libpaddle_full_api_shared.so` :包含 full_api 和 light_api 功能的动态库 + +3、 `third_party` 文件夹:第三方库文件 + +4、 `demo` 文件夹:c++ demo. + +如果编译打开了python选项,则会在 `build_cuda/inference_lite_lib/python/lib/` 目录下生成 `lite_core.so`。 ## 运行 @@ -36,7 +56,6 @@ cd Paddle-Lite 一: 下载darknet_yolov3模型,模型信息请参考[这里](https://github.com/PaddlePaddle/models/tree/develop/PaddleCV/yolov3) - ``` # 下载模型 wget https://paddle-inference-dist.cdn.bcebos.com/PaddleLite/yolov3_infer.tar.gz @@ -47,7 +66,7 @@ wget https://paddle-inference-dist.cdn.bcebos.com/PaddleLite/kite.jpg 二: 运行 -**NOTE:**此处示例使用的是python接口,后续会开放C++接口以及示例。 +**NOTE:**此处示例使用的是python接口。 ``` python #-*- coding: utf-8 -*- @@ -107,4 +126,14 @@ print (output_tensor.float_data()[:6]) ``` -**NOTE:** 对CUDA的支持还在持续开发中。 +**NOTE:** 此处示例使用的是C++接口。 + +``` +cd build_cuda/inference_lite_lib/demo/cxx/ +mkdir build && cd build +cmake .. +make +wget https://paddle-inference-dist.cdn.bcebos.com/PaddleLite/yolov3_infer.tar.gz +tar -zxf yolov3_infer.tar.gz +./demo yolov3_infer +``` diff --git a/docs/demo_guides/fpga.md b/docs/demo_guides/fpga.md new file mode 100644 index 0000000000000000000000000000000000000000..f7885fd3b7f6600fe890332d2805a386008659e5 --- /dev/null +++ b/docs/demo_guides/fpga.md @@ -0,0 +1,106 @@ +# PaddleLite使用FPGA预测部署 + +Paddle Lite支持基于arm的FPGA zu3/zu5/zu9的模型预测,提供armv8的交叉编译 + +Lite基于FPGA运行模型需要相应的FPGA驱动,目前只支持百度[Edgeboard开发板](https://ai.baidu.com/tech/hardware/deepkit) + +## Lite实现FPGA简介 + +Lite支持FPGA作为后端硬件进行模型推理,其主要特性如下: + +- Lite中FPGA的kernel(feed、fetch除外)均以FP16、NHWC的格式作为输入输出格式,所有的weights和bias仍为FP32、NCHW的格式,feed的输入和fetch的输出均为FP32、NCHW格式的数据,在提升计算速度的同时能做到用户对数据格式无感知 + +- 对于FPGA暂不支持的kernel,均会切回arm端运行,实现arm+FPGA混合布署运行 + +- 目前FPGA成本功耗都较低,Lite基于FPGA的模型性能远远好于arm端,可作为边缘设备首选硬件 + +## 编译 + +需要提前准备带有FPGAdrv.ko的FPGA开发板(如edgeboard开发板)和Lite代码 + +CMAKE编译选项: + +- 设置`LITE_WITH_FPGA=ON`和`LITE_WITH_ARM=ON` + +其他编译选项与ARM编译相同,可以参考[“Paddle 
Lite在Docker下的ARM编译”](../user_guides/source_compile)。 +示例如下: +```shell + cmake .. \ + -DWITH_GPU=OFF \ + -DWITH_MKL=OFF \ + -DWITH_LITE=ON \ + -DLITE_WITH_CUDA=OFF \ + -DLITE_WITH_X86=OFF \ + -DLITE_WITH_ARM=ON \ + -DLITE_WITH_OPENMP=ON \ + -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=ON \ + -DWITH_TESTING=OFF \ + -DLITE_WITH_FPGA=ON \ + -DARM_TARGET_OS=armlinux + make publish_inference -j2 +``` +Lite提供FPGA编译脚本,位于lite/tools/build_FPGA.sh,在Lite根目录执行该脚本即可编译 + +## 运行示例 + +- **运行文件准备** + +下面以Resnet50模型为例,介绍如何使用edgeboard开发板实现模型运行 + +```bash +#连接开发板,并利用screen命令启动 [本机执行] +screen /dev/cu.SLAB_USBtoUART 115200 +#查看开发板ip并ssh登录到开发板,假设开发板ip为192.0.1.1 [本机执行] +ssh root@192.0.1.1 + +#在开发板上建立目录workspace,拷贝FPGA驱动FPGAdrv.ko到workspace目录 [开发板执行] +mkdir workspace && scp $DRIVER_PATH/FPGAdrv.ko workspace + +#将Lite中编译好的测试程序拷贝到开发板workspace目录 [本机执行] +scp $LITE_ROOT/build_FPGA/lite/api/test_resnet50_FPGA root@$EDGEBOARD_IP:workspace/ +#把Resnet50的模型和参数scp到开发板workspace目录 [本机执行] +scp -r $LITE_ROOT/build_FPGA/lite/third_party/install/resnet50/ root@$EDGEBOARD_IP:workspace/ + +#在运行模型前需要加载FPGA驱动 [开发板执行] +insmod FPGAdrv.ko +#给测试程序添加可运行权限 [开发板执行] +chmod +x test_resnet50_FPGA +``` + +- **使用FPGA进行模型预测** + +```bash +#以下命令均在开发板上运行 +#直接运行单测程序 +./test_resnet50_FPGA --model_dir=resnet50 +#如果需要测试性能,可以用repeats参数设置模型运行次数(如1000),同时可以设置预热次数(如10)来让硬件事先运行到稳定水平 +./test_resnet50_FPGA --model_dir=resnet50 --repeats=1000 --warmup=10 +``` + +## 如何在Code中使用 + +在Lite中使用FPGA与ARM相似,具体的区别如下: + +- 由于fpga运行模式为fp16精度、nhwc布局,所以需要修改相应的`valid_place` +- fpga不需要device的初始化和运行模式设置 + +代码示例: +```cpp +lite::Predictor predictor; +std::vector valid_places( + {Place{TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)},Place{TARGET(kARM)}); + +predictor.Build(model_dir, "", "", valid_places); + +auto* input_tensor = predictor.GetInput(0); +input_tensor->Resize(DDim(std::vector({1, 3, 224, 224}))); +auto* data = input_tensor->mutable_data(); +auto item_size = input_tensor->dims().production(); +//假设设置输入数据全为1 +for (int i = 0; i < item_size; i++) { + 
data[i] = 1; +} + +predictor.Run(); +auto* out = predictor.GetOutput(0); +``` diff --git a/docs/demo_guides/ios_app_demo.md b/docs/demo_guides/ios_app_demo.md new file mode 100644 index 0000000000000000000000000000000000000000..2d9bbcbf83e1703a116d65c7ce8379638bd13cfe --- /dev/null +++ b/docs/demo_guides/ios_app_demo.md @@ -0,0 +1,129 @@ +# iOS Demo + +## 多种应用场景 + +我们提供Paddle-Lite示例工程[Paddle-Lite-Demo](https://github.com/PaddlePaddle/Paddle-Lite-Demo),其中包含[Android](https://github.com/PaddlePaddle/Paddle-Lite-Demo/tree/master/PaddleLite-android-demo)、[iOS](https://github.com/PaddlePaddle/Paddle-Lite-Demo/tree/master/PaddleLite-ios-demo)和[Armlinux](https://github.com/PaddlePaddle/Paddle-Lite-Demo/tree/master/PaddleLite-armlinux-demo)平台的示例工程。iOS demo涵盖[图像分类](https://github.com/PaddlePaddle/Paddle-Lite-Demo/tree/master/PaddleLite-android-demo/image_classification_demo)、[目标检测](https://github.com/PaddlePaddle/Paddle-Lite-Demo/tree/master/PaddleLite-android-demo/object_detection_demo)2个应用场景。 + +### 1. 图像分类 + +图像分类是Paddle-Lite 提供的图像处理demo ,在移动端上提供了实时的物体识别能力,可以应用到生产线自动分拣或质检、识别医疗图像、辅助医生肉眼诊断等场景。在移动端预测的效果图如下: + +

     

+ +### 2. 物体检测 + +物体检测是Paddle-Lite 提供的图像识别demo ,在移动端上提供了检测多个物体的位置、名称、位置及数量的能力。可以应用到视频监控(是否有违规物体或行为)、工业质检(微小瑕疵的数量和位置)、医疗诊断(细胞计数、中药识别)等场景。在移动端预测的效果图如下: + +

     

+ +## iOS demo部署方法 + +下面我们以**目标检测(object_detection_demo)**为例讲解如何部署iOS工程。 + +**目的**:将基于Paddle-Lite预测库的iOS APP部署到苹果手机,实现物体检测。 + +**需要的环境**:Mac 电脑上安装Xcode、苹果手机、下载到本地的[Paddle-Lite-Demo](https://github.com/PaddlePaddle/Paddle-Lite-Demo)工程 + +**部署步骤**: + +1、 目标检测的iOS示例位于 `Paddle-Lite-Demo\PaddleLite-ios-demo\object_detection_demo` + +2、终端中执行 `download_dependencies.sh`脚本自动下载模型和Paddle-Lite预测库 + +```shell +cd PaddleLite-ios-demo # 1. 终端中进入 Paddle-Lite-Demo\PaddleLite-ios-demo +sh download_dependencies.sh # 2. 执行脚本下载依赖项 (需要联网) +``` + +下载完成后会出现提示: `Extract done ` + +3、用Xcode打开`object_detection_demo/detection_demo.xcodeproj`文件,修改工程配置。 +依次修改 `General/Identity`和`Signing&Capabilities`属性,替换为自己的工程代号和团队名称。(必须修改,不然无法通过编译) + +![Xcode1](https://paddlelite-data.bj.bcebos.com/doc_images/Android_iOS_demo/iOS/Xcode1.png) + + + +![Xcode2](https://paddlelite-data.bj.bcebos.com/doc_images/Android_iOS_demo/iOS/Xcode2.png) + +4、 IPhone手机连接电脑,在Xcode中连接自己的手机 (第一次连接IPhone到电脑时,需要在IPhone的`设置->通用->设备管理`中选择本电脑并信任) + +

+ +5、按下左上角的 Run按钮,自动编译APP并安装到手机。在苹果手机中设置信任该APP(进入`设置->通用->设备管理`,选中新安装的APP并`验证该应用`) + +成功后效果如下,图一:APP安装到手机 图二: APP打开后的效果,会自动识别图片中的物体并标记 + +

     

+ +## iOS demo结构讲解 + +iOS 示例的代码结构如下图所示: + +

+ + 1、 mobilenetv1-ssd: 模型文件 (opt 工具转化后Paddle-Lite模型) + +```shell +# 位置: +ios-detection_demo/detection_demo/models/mobilenetv1-ssd +``` + + 2、 libpaddle_api_light_bundled.a、paddle_api.h : Paddle-Lite C++ 预测库和头文件 + +```shell +# 位置: +# iOS预测库 +ios-detection_demo/detection_demo/lib/libpaddle_api_light_bundled.a +# 预测库头文件 +ios-detection_demo/detection_demo/include/paddle_api.h +ios-detection_demo/detection_demo/include/paddle_use_kernels.h +ios-detection_demo/detection_demo/include/paddle_use_ops.h +``` + + 3、 ViewController.mm:主要预测代码 + +```shell +# 位置 +ios-detection_demo/detection_demo/ViewController.mm +``` + +## 代码讲解 (如何使用Paddle-Lite C++ API 执行预测) + +IOS 示例基于C++ API 开发,调用Paddle-Lite C++ API包括以下五步。更详细的API 描述参考: [Paddle-Lite C++ API](https://paddle-lite.readthedocs.io/zh/latest/api_reference/java_api_doc.html)。 + +```c++ +#include +// 引入C++ API +#include "paddle_lite/paddle_api.h" +#include "paddle_lite/paddle_use_ops.h" +#include "paddle_lite/paddle_use_kernels.h" + +// 1. 设置MobileConfig +MobileConfig config; +config.set_model_from_file(); // 设置NaiveBuffer格式模型路径 +config.set_power_mode(LITE_POWER_NO_BIND); // 设置CPU运行模式 +config.set_threads(4); // 设置工作线程数 + +// 2. 创建PaddlePredictor +std::shared_ptr predictor = CreatePaddlePredictor(config); + +// 3. 设置输入数据 +std::unique_ptr input_tensor(std::move(predictor->GetInput(0))); +input_tensor->Resize({1, 3, 224, 224}); +auto* data = input_tensor->mutable_data(); +for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) { + data[i] = 1; +} + +// 4. 执行预测 +predictor->run(); + +// 5. 
获取输出数据 +std::unique_ptr output_tensor(std::move(predictor->GetOutput(0))); +std::cout << "Output shape " << output_tensor->shape()[1] << std::endl; +for (int i = 0; i < ShapeProduction(output_tensor->shape()); i += 100) { + std::cout << "Output[" << i << "]: " << output_tensor->data()[i] + << std::endl; +} +``` diff --git a/docs/demo_guides/java_demo.md b/docs/demo_guides/java_demo.md new file mode 100644 index 0000000000000000000000000000000000000000..ad37e7b95dbd439ccc7393af27140a404e16cf07 --- /dev/null +++ b/docs/demo_guides/java_demo.md @@ -0,0 +1,99 @@ +# Java Demo + +本节中,Java demo 完整代码位于 [demo/java](https://github.com/PaddlePaddle/Paddle-Lite/tree/develop/lite/demo/java) 。 + +要编译和跑起Android demo 程序 PaddlePredictor,你需要准备: + +1. 一台能运行安卓程序的安卓手机 +2. 一台带有AndroidStudio的开发机 + +## 编译 + +首先在PaddleLite的开发 [Docker镜像](../user_guides/source_compile) 中,拉取最新PaddleLite代码,编译对应你手机架构的预测库, +下面我们以arm8 架构举例。进入paddlelite 目录,运行以下命令: + +```shell +./lite/tools/build.sh \ + --arm_os=android \ + --arm_abi=armv8 \ + --arm_lang=gcc \ + --android_stl=c++_static \ + tiny_publish +``` + +命令完成后查看要存在 + +``` +./build.lite.android.armv8.gcc/inference_lite_lib.android.armv8/java/so/libpaddle_lite_jni.so +./build.lite.android.armv8.gcc/inference_lite_lib.android.armv8/java/jar/PaddlePredictor.jar +./build.lite.android.armv8.gcc/inference_lite_lib.android.armv8/demo/java/android +``` + +libpaddle_lite_jni.so为 PaddleLite c++ 动态链接库,PaddlePredictor.jar为 Java jar 包,两者包含 PaddleLite Java API,接下来 Android Java 代码会使用这些api。android文件夹中则是Android demo。 + +## 准备 demo 需要的其他文件 + +Demo 除了代码,还需要准备在Android工程目录下配置好JNI .so 库(上节提到的`libpaddle_lite_jni.so`),Java .jar 包(上文提到的`PaddlePredictor.jar` ),和模型文件。我们提供了自动化的脚本和手动拷贝两种方法,用户可以根据自己需要选择: + +### 脚本方法 + +进入 `build.lite.android.armv8.gcc/inference_lite_lib.android.armv8/demo/java/android`,我们准备了一个脚本`prepare_demo.bash`,脚本输入一个参数,为你要拷贝的.so 对应的架构文件夹名。 + +例如运行 + +``` +bash prepare_demo.bash arm8 +``` + +该脚本自动下载并解压缩模型文件,拷贝了 .jar 
包进demo,还有生成的.so包进`PaddlePredictor/app/src/main/jinLibs/架构文件夹下`, +在我们这个例子里,armv8 就是架构文件夹。备注:这种方式构建的 demo 在 armv8 手机运行正常。如果要demo 程序在别的手机架构(如 armv7)上也运行正常,需要添加别的架构。 + +### 手动拷贝方法 + +接下来我们介绍手动拷贝,如果使用了脚本,那么可以跳过以下手动方法的介绍。 + +### 把 .so 动态库和 .jar 拷贝进安卓demo程序: + +1. 将PaddlePredictor 载入到AndroidStudio。 +2. 将`libpaddle_lite_jni.so`拷贝进 `PaddlePredictor/app/src/main/jinLibs/架构文件夹下` ,比如文件夹arm8里要包含该 .so文件。 +3. 将 `PaddlePredictor.jar` 拷贝进 `PaddlePredictor/app/libs` 下 + +### 把demo使用到的模型文件拷贝进安卓程序: + +下载我们的5个模型文件,并解压缩到 `PaddlePredictor/app/src/main/assets` 这个文件夹中 +需要拷贝的模型文件和下载地址: + +``` +inception_v4_simple_opt.nb http://paddle-inference-dist.bj.bcebos.com/inception_v4_simple_opt.nb.tar.gz +lite_naive_model_opt.nb http://paddle-inference-dist.bj.bcebos.com/lite_naive_model_opt.nb.tar.gz +mobilenet_v1_opt.nb http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1_opt.nb.tar.gz +mobilenet_v2_relu_opt.nb http://paddle-inference-dist.bj.bcebos.com/mobilenet_v2_relu_opt.nb.tar.gz +resnet50_opt.nb http://paddle-inference-dist.bj.bcebos.com/resnet50_opt.nb.tar.gz +``` + +下载完后,assets文件夹里要包含解压后的上面五个模型文件夹,但demo里不需要保存原压缩.tar.gz 文件。 + +注意:输入的模型要求为naive buffer存储格式,您可以通过 [**Model Optimize Tool**](../user_guides/model_optimize_tool) 将fluid模型转为naive buffer存储格式。 + +## 运行 Android 程序结果 + +以上准备工作完成,就可以开始Build 、安装、和运行安卓demo程序。当你运行PaddlePredictor 程序时,大概会等10秒,然后看到类似以下字样: + +``` +lite_naive_model output: 50.213173, -28.872887 +expected: 50.2132, -28.8729 + +inception_v4_simple test:true +time: xxx ms + +resnet50 test:true +time: xxx ms + +mobilenet_v1 test:true +time: xxx ms + +mobilenet_v2 test:true +time: xxx ms +``` + +该 demo 程序跑我们的 5 个模型,第一个模型结果将真正的头两个数字输出,并在第二行附上期望的正确值。你应该要看到他们的误差小于0.001。后面四个模型如果你看到 `test:true` 字样,说明模型输出通过了我们在 demo 程序里对其输出的测试。time 代表该测试花费的时间。 diff --git a/docs/demo_guides/npu.md b/docs/demo_guides/npu.md new file mode 100644 index 0000000000000000000000000000000000000000..0bdec8d73a881c186d9c4141e2d59a1b2bf11d8b --- /dev/null +++ b/docs/demo_guides/npu.md @@ -0,0 +1,128 @@ +# 
PaddleLite使用NPU(华为)预测部署 + +Paddle Lite是首款支持华为自研达芬奇架构NPU(Kirin 810/990 SoC搭载的NPU)的预测框架。 +原理是在线分析Paddle模型,将Paddle算子转成HiAI IR后,调用HiAI IR/Builder/Runtime APIs生成并执行HiAI模型。 + +## 已支持的设备 + +- 华为nova5、nova5i pro、mate30、mate30 pro、mate30 5G、荣耀v30,以及即将推出的mate40、p40。据华为透露,今后上市的大部分手机都会搭载其自研达芬奇架构NPU。 + +## 已支持的模型 + +- MobileNetV1 +- MobileNetV2 +- ResNet-18/50 +- ShuffleNetV2 +- CycleGAN (暂时需要华为内部rom的支持) +- 百度内部业务模型(由于涉密,不方便透露具体细节) + +## 已支持(或部分支持)的Paddle算子 + +- sigmoid +- relu +- tanh +- relu_clipped +- leaky_relu +- softsign +- hard_sigmoid +- batch_norm +- concat +- conv2d +- depthwise_conv2d +- conv2d_transpose +- dropout +- elementwise_add +- elementwise_sub +- elementwise_mul +- elementwise_div +- fusion_elementwise_add_activation +- fusion_elementwise_sub_activation +- fusion_elementwise_mul_activation +- fusion_elementwise_div_activation +- fc +- bilinear_interp +- nearest_interp +- matmul +- mul +- pad2d +- pool2d +- reduce_mean +- reshape +- reshape2 +- scale +- shuffle_channel +- softmax +- split +- sqrt +- square +- transpose +- transpose2 +- unsqueeze +- unsqueeze2 +- instance_norm (暂时需要华为内部rom的支持) +- layer_norm (暂时需要华为内部rom的支持) + +## 编译支持NPU的Paddle Lite库 + +- 从https://developer.huawei.com/consumer/cn/hiai/下载华为HiAI DDK后解压到任意路径(注意:华为提供了多个版本的DDK,我们需要下载针对麒麟810/990芯片HiAI Foundation开发套件,例如最新的[DDK V310版本](https://obs.cn-north-2.myhwclouds.com/hms-ds-wf/sdk/hwhiai-ddk-100.310.011.010.zip))。 +- 将HiAI DDK中的ai_ddk_lib目录拷贝至Paddle Lite源码根目录后,使用[NPU编译脚本](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/lite/tools/build_npu.sh)编译full_publish和tiny_publish。 + +注意:以下是HiAI DDK V310版解压后的目录结构,需要将ai_ddk_lib目录拷贝至Paddle Lite源码根目录。 +```shell +- app_sample +- ddk + - ai_ddk_lib + - include + - lib # for armv7 + - lib64 # for armv8 +- document +- tools +``` + +- full_publish and tiny_publish for armv8,由于HiAI DDK的armv7和armv8的so库均基于c++_shared构建,因此,建议使用c++_shared编译Paddle Lite。 +```shell +$ ./lite/tools/build_npu.sh --arm_os=android --arm_abi=armv8 --arm_lang=gcc 
--android_stl=c++_shared full_publish +$ ./lite/tools/build_npu.sh --arm_os=android --arm_abi=armv8 --arm_lang=gcc --android_stl=c++_shared tiny_publish +``` + +- full_publish and tiny_publish for armv7 +```shell +$ ./lite/tools/build_npu.sh --arm_os=android --arm_abi=armv7 --arm_lang=gcc --android_stl=c++_shared full_publish +$ ./lite/tools/build_npu.sh --arm_os=android --arm_abi=armv7 --arm_lang=gcc --android_stl=c++_shared tiny_publish +``` + +注意:为了保证编译环境一致,建议参考[源码编译](../user_guides/source_compile)中的Docker开发环境进行配置,然后再执行上述命令。 + +## 优化生成NPU模型 + +- model_optimize_tool工具已经支持生成NPU模型,仅需要将valid_targets设置为npu,arm即可,具体参考[模型转化方法](../user_guides/model_optimize_tool)。 +```shell +./model_optimize_tool --model_dir= \ + --model_file= \ + --param_file= \ + --optimize_out_type=(protobuf|naive_buffer) \ + --optimize_out= \ + --valid_targets=npu,arm \ + --record_tailoring_info =(true|false) +``` +- model_optimize_tool生成的模型只是标记了NPU支持的Paddle算子,并没有真正生成NPU HiAI模型,只有在执行时才会将标记的Paddle算子转成HiAI IR,最终生成并执行HiAI模型,具体实现参考PR[2576](https://github.com/PaddlePaddle/Paddle-Lite/pull/2576)。 +- 不同模型,不同型号(ROM版本)的华为手机,在执行阶段,由于某些Paddle算子无法完全转成HiAI IR,或目标手机的HiAI版本过低等原因,可能导致HiAI模型无法成功生成,在这种情况下,Paddle Lite会调用CPU版算子进行运算完成整个预测任务。 + +## 通过JAVA接口加载并执行NPU模型 + +- 使用方法和[Java实例](java_demo)一致,无需额外设置任何参数,只需将模型换成NPU模型即可。[Paddle-Lite-Demo](https://github.com/PaddlePaddle/Paddle-Lite-Demo)中的Image Classification Demo for Android是同时支持CPU和NPU两种模型的图像分类Demo。 + +注意:在拷贝libpaddle_lite_jni.so的时候,由于依赖HiAI DDK so和libc++_shared.so库,需要将HiAI DDK中ai_ddk_lib/lib或ai_ddk_lib/lib64目录下的所有so和libc++_shared.so,拷到libpaddle_lite_jni.so同级目录下。 + +## 通过C++接口加载并执行NPU模型 + +- 使用方法和[C++实例](cpp_demo)一致,同样无需额外设置任何参数,只需将模型换成NPU模型即可。 + +注意:1)不能使用安卓模拟器,需要使用真实设备,且必须是支持NPU的华为手机。2)在使用adb push命令向手机推送目标程序时,需要将HiAI DDK中ai_ddk_lib/lib或ai_ddk_lib/lib64目录下的所有so和libc++_shared.so,推送到目标程序同级目录下。 + + +## 其它说明 + +- 华为达芬奇架构的NPU内部大量采用float16进行运算,因此,预测结果会存在偏差,但大部分情况下精度不会有较大损失,可参考[Paddle-Lite-Demo](https://github.com/PaddlePaddle/Paddle-Lite-Demo)中Image Classification Demo 
for Android对同一张图片CPU与NPU的预测结果。 +- 华为Kirin 810/990 Soc搭载的自研达芬奇架构的NPU,与Kirin 970/980 Soc搭载的寒武纪NPU不一样,同样的,与Hi3559A、Hi3519A使用的NNIE也不一样,Paddle Lite只支持华为自研达芬奇架构NPU。 +- 我们正在持续增加能够适配HiAI IR的Paddle算子bridge/converter,以便适配更多Paddle模型,同时华为研发同学也在持续对HiAI IR性能进行优化。 diff --git a/docs/user_guides/opencl.md b/docs/demo_guides/opencl.md similarity index 53% rename from docs/user_guides/opencl.md rename to docs/demo_guides/opencl.md index e9533af1ff6e2447a8e4d389df90cdb457f58fb2..e255038575796f0c1079f47fb859f8402ac79c1f 100644 --- a/docs/user_guides/opencl.md +++ b/docs/demo_guides/opencl.md @@ -1,4 +1,4 @@ -# Lite基于OpenCL的ARM GPU预测 +# PaddleLite使用OpenCL预测部署 Lite支持在Android系统上运行基于OpenCL的程序,目前支持Ubuntu环境下armv8、armv7的交叉编译。 @@ -11,18 +11,45 @@ Lite支持在Android系统上运行基于OpenCL的程序,目前支持Ubuntu环 详见 **源码编译指南-环境准备** 章节。 -### 编译选项 - -|参数|介绍|值| -|--------|--------|--------| -|--arm_os|代表目标操作系统|目前仅支持且默认为`android`| -|--arm_abi|代表体系结构类型,支持armv8和armv7|默认为`armv8`即arm64-v8a;`armv7`即armeabi-v7a| -|--arm_lang|代表编译目标文件所使用的编译器|默认为gcc,支持 gcc和clang两种| - ### 编译Paddle-Lite OpenCL库范例 注:以android-armv8-opencl的目标、Docker容器的编译开发环境为例,CMake3.10,android-ndk-r17c位于`/opt/`目录下。 +#### 针对 Lite 用户的编译命令(无单元测试,有编译产物) + +- `arm_os`: `[android]`,目前不支持linux; +- `arm_abi`: `[armv7 | armv8]`; +- `arm_lang`: `[gcc]`,目前不支持clang; +- `build_extra`: `[OFF | ON]`,编译全量op和kernel,体积会大,编译时间长; +- `build_cv`: `[OFF | ON]`,编译arm cpu neon实现的的cv预处理模块; +- `android_stl`: `[c++_shared | c++_static]`,paddlelite的库以何种方式链接`android_stl`,选择`c++_shared`得到的动态库体积更小,但使用时候记得上传paddlelite所编译版本(armv7或armv8)一致的`libc++_shared.so`(来自Android-NDK); +注:调用`./lite/tools/build.sh`执行编译。 + +```bash +# 假设当前位于处于Lite源码根目录下 + +# 导入NDK_ROOT变量,注意检查您的安装目录若与本示例不同 +export NDK_ROOT=/opt/android-ndk-r17c + +# 删除上一次CMake自动生成的.h文件 +rm ./lite/api/paddle_use_kernels.h +rm ./lite/api/paddle_use_ops.h + +# 根据指定编译参数编译 +./lite/tools/build.sh \ + --arm_os=android \ + --arm_abi=armv8 \ + --arm_lang=gcc \ + --build_extra=OFF \ + --build_cv=OFF \ + --android_stl=c++_shared \ + opencl +``` + +#### 针对 Lite 
开发者的编译命令(有单元测试,编译产物) + +注:调用`./lite/tools/ci_build.sh`执行编译,该命令会编译armv7和armv8的opencl库。虽然有编译产物,但因编译单元测试,编译产物包体积可能较大,不推荐使用。 + ```bash # 假设当前位于处于Lite源码根目录下 @@ -38,16 +65,20 @@ rm ./lite/api/paddle_use_ops.h --arm_os=android \ --arm_abi=armv8 \ --arm_lang=gcc \ - build_test_arm_opencl + build_opencl ``` +注:如果要调试cl kernel,假设已经完成上述脚本编译(已生成cmake文件)。调试只需要修改`./lite/backends/opencl/cl_kernel/`下对应的kernel文件,保存后在项目根目录执行`python ./lite/tools/cmake_tools/gen_opencl_code.py ./lite/backends/opencl/cl_kernel ./lite/backends/opencl/opencl_kernels_source.cc`,该命令会自动将修改后,再切到build目录下执行`make publish_inference`或者你要编译的单测的可执行文件名,cl kernel文件的内容会随着编译自动打包到产物包如 .so 中或者对应单测可执行文件中。 + +### 编译产物说明 + 编译产物位于`build.lite.android.armv8.gcc.opencl`下的`inference_lite_lib.android.armv8.opencl`文件夹内,这里仅罗列关键产物: - `cxx`:该目录是编译目标的C++的头文件和库文件; - `demo`:该目录包含了两个demo,用来调用使用`libpaddle_api_full_bundled.a`和`libpaddle_api_light_bundled.a`,分别对应`mobile_full`和`mobile_light`文件夹。编译对应的demo仅需在`mobile_full`或`mobile_light`文 - `mobile_full`:使用cxx config,可直接加载fluid模型,若使用OpenCL需要在`mobilenetv1_full_api.cc`代码里开启`DEMO_USE_OPENCL`的宏,详细见代码注释; - - `mobile_light`:使用mobile config,只能加载`model_optimize_tool`优化过的模型; -- `opencl`:该目录存放opencl实现的相关kernel。 + - `mobile_light`:使用mobile config,只能加载`model_optimize_tool`优化过的模型。 +注:`opencl`实现的相关kernel已经打包到动态库中。 ```bash . 
@@ -65,40 +96,23 @@ rm ./lite/api/paddle_use_ops.h | |-- libpaddle_api_light_bundled.a | |-- libpaddle_full_api_shared.so | `-- libpaddle_light_api_shared.so -|-- demo -| `-- cxx -| |-- Makefile.def -| |-- README.md -| |-- include -| | |-- paddle_api.h -| | |-- paddle_lite_factory_helper.h -| | |-- paddle_place.h -| | |-- paddle_use_kernels.h -| | |-- paddle_use_ops.h -| | `-- paddle_use_passes.h -| |-- mobile_full -| | |-- Makefile -| | `-- mobilenetv1_full_api.cc -| `-- mobile_light -| |-- Makefile -| `-- mobilenetv1_light_api.cc -`-- opencl - `-- cl_kernel - |-- buffer - | |-- depthwise_conv2d_kernel.cl - | |-- elementwise_add_kernel.cl - | |-- fc_kernel.cl - | |-- im2col_kernel.cl - | |-- layout_kernel.cl - | |-- mat_mul_kernel.cl - | |-- pool_kernel.cl - | `-- relu_kernel.cl - |-- cl_common.h - `-- image - |-- channel_add_kernel.cl - |-- elementwise_add_kernel.cl - |-- pool_kernel.cl - `-- relu_kernel.cl +`-- demo + `-- cxx + |-- Makefile.def + |-- README.md + |-- include + | |-- paddle_api.h + | |-- paddle_lite_factory_helper.h + | |-- paddle_place.h + | |-- paddle_use_kernels.h + | |-- paddle_use_ops.h + | `-- paddle_use_passes.h + |-- mobile_full + | |-- Makefile + | `-- mobilenetv1_full_api.cc + `-- mobile_light + |-- Makefile + `-- mobilenetv1_light_api.cc ``` 调用`libpaddle_api_full_bundled.a`和`libpaddle_api_light_bundled.a`见下一部分运行示例。 @@ -109,48 +123,9 @@ rm ./lite/api/paddle_use_ops.h 下面以android、ARMv8、gcc的环境为例,介绍3个示例,分别如何在手机上执行基于OpenCL的ARM GPU推理过程。 - -**注意:** 以下命令均在Lite源码根目录下运行。在3个示例前,下面这段命令都先要执行用来准备环境: - -```bash -# 在/data/local/tmp目录下创建OpenCL文件目录 -adb shell mkdir -p /data/local/tmp/opencl -adb shell mkdir -p /data/local/tmp/opencl/cl_kernel/buffer -adb shell mkdir -p /data/local/tmp/opencl/cl_kernel/image - -# 将OpenCL的kernels文件推送到/data/local/tmp/opencl目录下 -adb push lite/backends/opencl/cl_kernel/cl_common.h /data/local/tmp/opencl/cl_kernel/ -adb push lite/backends/opencl/cl_kernel/buffer/* /data/local/tmp/opencl/cl_kernel/buffer/ -adb push 
lite/backends/opencl/cl_kernel/image/* /data/local/tmp/opencl/cl_kernel/image/ -``` - ### 运行示例1: 编译产物demo示例 ```bash -###################################################################### -# 编译mobile_full的demo # -###################################################################### -# 步骤: # -# 0.确保编译Paddle-Lite时编译了OpenCL; # -# 1.编辑`mobilenetv1_full_api.cc`代码, 开启`DEMO_USE_OPENCL`的宏; # -# 2.在产物目录`demo/cxx/mobile_full`下编译`mobile_full`的demo; # -# 3.上传demo, 模型, opencl kernel文件到手机; # -# 4.运行demo得到预期结果. # -###################################################################### -adb shell mkdir /data/local/tmp/opencl/mobilenet_v1 -chmod +x ./build.lite.android.armv8.gcc.opencl/inference_lite_lib.android.armv8.opencl/demo/cxx/mobile_full/mobilenetv1_full_api -adb push ./build.lite.android.armv8.gcc.opencl/inference_lite_lib.android.armv8.opencl/demo/cxx/mobile_full/mobilenetv1_full_api /data/local/tmp/opencl/ -adb push ./build.lite.android.armv8.gcc.opencl/install/mobilenet_v1/* /data/local/tmp/opencl/mobilenet_v1 - -# use mobile_full run mobilenet_v1 -# `GLOG_v` is log level -adb shell "export GLOG_v=0; \ - /data/local/tmp/opencl/mobilenetv1_full_api \ - --model_dir=/data/local/tmp/opencl/mobilenet_v1 \ - --optimized_model_dir=/data/local/tmp/opencl/full_api_opt_model" - - - ###################################################################### # 编译mobile_light的demo # ###################################################################### @@ -158,33 +133,40 @@ adb shell "export GLOG_v=0; \ # 0.确保编译Paddle-Lite时编译了OpenCL; # # 1.编译model_optimize_tool并对模型优化, `targets`参数为`opencl`; # # 2.在产物目录`demo/cxx/mobile_light`下编译`mobile_light`的demo; # -# 3.上传demo, 模型, opencl kernel文件到手机; # +# 3.上传demo, 模型文件到手机; # # 4.运行demo得到预期结果. 
# ###################################################################### +# 在/data/local/tmp目录下创建OpenCL文件目录 +adb shell mkdir -p /data/local/tmp/opencl # use model_optimize_tool to optimize model ./build.model_optimize_tool/lite/api/model_optimize_tool \ --model_dir=./build.lite.android.armv8.gcc.opencl/install/mobilenet_v1/ \ --optimize_out_type=naive_buffer \ - --optimize_out=./build.lite.android.armv8.gcc.opencl/install/mobilenet_v1/ \ + --optimize_out=./build.lite.android.armv8.gcc.opencl/install/mobilenet_v1/mobilenetv1_opt \ --valid_targets=opencl -adb shell mkdir /data/local/tmp/opencl/mobilenet_v1 +adb shell mkdir /data/local/tmp/opencl/mobilenet_v1/ chmod +x ./build.lite.android.armv8.gcc.opencl/inference_lite_lib.android.armv8.opencl/demo/cxx/mobile_light/mobilenetv1_light_api adb push ./build.lite.android.armv8.gcc.opencl/inference_lite_lib.android.armv8.opencl/demo/cxx/mobile_light/mobilenetv1_light_api /data/local/tmp/opencl/ -adb push ./build.lite.android.armv8.gcc.opencl/install/mobilenet_v1/* /data/local/tmp/opencl/mobilenet_v1 +adb push ./build.lite.android.armv8.gcc.opencl/install/mobilenet_v1/mobilenetv1_opt.nb /data/local/tmp/opencl/ # use mobile_light run mobilenet_v1 -adb shell "export GLOG_v=5; \ +adb shell "export GLOG_v=1; \ /data/local/tmp/opencl/mobilenetv1_light_api \ - --model_dir=/data/local/tmp/opencl/" + /data/local/tmp/opencl/mobilenetv1_opt.nb" ``` +**注:** `GLOG_v`是指定需要显示VLOG的日志级别,默认为0。权重参数会在第一次运行时加载,所以第一次执行时间略长。一般将warmup的值设为10,repeats值设为多次。 + ### 运行示例2: test_mobilenetv1单元测试 - **运行文件准备** ```bash +# 在/data/local/tmp目录下创建OpenCL文件目录 +adb shell mkdir -p /data/local/tmp/opencl + # 将mobilenet_v1的模型文件推送到/data/local/tmp/opencl目录下 adb shell mkdir -p /data/local/tmp/opencl/mobilenet_v1 adb push build.lite.android.armv8.gcc.opencl/third_party/install/mobilenet_v1/* /data/local/tmp/opencl/mobilenet_v1/ @@ -195,42 +177,26 @@ adb push build.lite.android.armv8.gcc.opencl/lite/api/test_mobilenetv1 /data/loc - **执行OpenCL推理过程** -使用如下命令运行OpenCL程序。其中: 
- -- `--cl_path`指定了OpenCL的kernels文件即cl\_kernel所在目录; -- `--modle_dir`指定了模型文件所在目录。 - ```bash adb shell chmod +x /data/local/tmp/opencl/test_mobilenetv1 -adb shell /data/local/tmp/opencl/test_mobilenetv1 \ - --cl_path=/data/local/tmp/opencl \ - --model_dir=/data/local/tmp/opencl/mobilenet_v1 \ - --warmup=1 \ - --repeats=1 +adb shell "export GLOG_v=1; \ + /data/local/tmp/opencl-image/test_mobilenetv1 \ + --model_dir=/data/local/tmp/opencl-image/mobilenetv1_fluid/ \ + --warmup=10 \ + --repeats=100" ``` -**注意:** 因为权重参数均会在Op Kernel第一次运行时进行加载,所以第一次的执行时间会略长。一般将warmup的值设为1,repeats值设为多次。 - ### 运行示例3: test_layout_opencl单元测试 -- **运行文件准备** - -```bash -# 将OpenCL单元测试程序test_layout_opencl,推送到/data/local/tmp/opencl目录下 -adb push build.lite.android.armv8.gcc.opencl/lite/kernels/opencl/test_layout_opencl /data/local/tmp/opencl/ -``` - - -OpenCL推理过程** - ```bash +adb shell mkdir -p /data/local/tmp/opencl adb shell chmod +x /data/local/tmp/opencl/test_layout_opencl -adb shell /data/local/tmp/opencl/test_layout_opencl +adb shell "export GLOG_v=4; \ + /data/local/tmp/opencl/test_layout_opencl" ``` - -# 如何在Code中使用 +### 如何在Code中使用 见运行示例1的demo代码: diff --git a/docs/advanced_user_guides/x86.md b/docs/demo_guides/x86.md similarity index 53% rename from docs/advanced_user_guides/x86.md rename to docs/demo_guides/x86.md index 7cb08683440312b0349662699b05e99df0cb6df1..c65ca99006b924488ceee50489e3d5654bae990c 100644 --- a/docs/advanced_user_guides/x86.md +++ b/docs/demo_guides/x86.md @@ -1,6 +1,6 @@ -# 使用X86预测库 +# PaddleLite使用X86预测部署 -Paddle-Lite 支持在Docker或Linux环境编译x86预测库。环境搭建参考[环境准备](../installation/source_compile)。 +Paddle-Lite 支持在Docker或Linux环境编译x86预测库。环境搭建参考[环境准备](../user_guides/source_compile)。 (注意:非docker Linux环境需要是Ubuntu16.04) @@ -9,8 +9,8 @@ Paddle-Lite 支持在Docker或Linux环境编译x86预测库。环境搭建参考 1、 下载代码 ```bash git clone https://github.com/PaddlePaddle/Paddle-Lite.git -#需要切换到 release/v2.0.0之后版本 -git checkout +# 切换到release分支 +git checkout release/v2.3 ``` 2、 源码编译 @@ -42,43 +42,56 @@ x86编译结果位于 
`build.lite.x86/inference_lite_lib` ## x86预测API使用示例 +1、我们提供Linux环境下x86 API运行mobilenet_v1的示例:[mobilenet_full_x86demo](https://paddlelite-data.bj.bcebos.com/x86/mobilenet_full_x86demo.zip)。下载解压后内容如下: + +![](https://paddlelite-data.bj.bcebos.com/x86/x86-doc/demo.png) + +`mobilenet_v1`为模型文件、`lib`和`include`分别是Paddle-Lite的预测库和头文件、`third_party`下是编译时依赖的第三方库`mklml`、`mobilenet_full_api.cc`是x86示例的源代码、`build.sh`为编译的脚本。 + +2、demo内容与使用方法 + +``` bash +# 1、编译 +sh build.sh +``` +编译结果为当前目录下的 `mobilenet_full_api ` +``` bash +# 2、执行预测 +mobilenet_full_api mobilenet_v1 +``` +`mobilenet_v1`为当前目录下的模型路径,`mobilenet_full_api`为第一步编译出的可执行文件。 + +3、示例源码`mobilenet_full_api.cc` + ```c++ -#include #include #include -#include "paddle_api.h" // NOLINT -#include "paddle_use_kernels.h" // NOLINT -#include "paddle_use_ops.h" // NOLINT -#include "paddle_use_passes.h" // NOLINT +#include "paddle_api.h" -using namespace paddle::lite_api; // NOLINT -DEFINE_string(model_dir, "", "Model dir path."); -DEFINE_string(optimized_model_dir, "", "Optimized model dir."); -DEFINE_bool(prefer_int8_kernel, false, "Prefer to run model with int8 kernels"); +using namespace paddle::lite_api; // NOLINT int64_t ShapeProduction(const shape_t& shape) { int64_t res = 1; for (auto i : shape) res *= i; return res; } -void RunModel() { - // 1. Set CxxConfig - CxxConfig config; - config.set_model_file(FLAGS_model_dir + "model"); - config.set_param_file(FLAGS_model_dir + "params"); - - config.set_valid_places({ - lite_api::Place{TARGET(kX86), PRECISION(kFloat)} - }); +void RunModel(std::string model_dir) { + // 1. Create CxxConfig + CxxConfig config; + config.set_model_dir(model_dir); + config.set_valid_places({ + Place{TARGET(kX86), PRECISION(kFloat)}, + Place{TARGET(kHost), PRECISION(kFloat)} + }); // 2. Create PaddlePredictor by CxxConfig std::shared_ptr predictor = CreatePaddlePredictor(config); // 3. 
Prepare input data std::unique_ptr input_tensor(std::move(predictor->GetInput(0))); - input_tensor->Resize(shape_t({1, 3, 224, 224})); + input_tensor->Resize({1, 3, 224, 224}); auto* data = input_tensor->mutable_data(); for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) { data[i] = 1; @@ -90,15 +103,21 @@ void RunModel() { // 5. Get output std::unique_ptr output_tensor( std::move(predictor->GetOutput(0))); - std::cout << "Output dim: " << output_tensor->shape()[1] << std::endl; + std::cout << "Output shape " << output_tensor->shape()[1] << std::endl; for (int i = 0; i < ShapeProduction(output_tensor->shape()); i += 100) { - std::cout << "Output[" << i << "]:" << output_tensor->data()[i] << std::endl; + std::cout << "Output[" << i << "]: " << output_tensor->data()[i] + << std::endl; } } int main(int argc, char** argv) { - google::ParseCommandLineFlags(&argc, &argv, true); - RunModel(); + if (argc < 2) { + std::cerr << "[ERROR] usage: ./" << argv[0] << " naive_buffer_model_dir\n"; + exit(1); + } + std::string model_dir = argv[1]; + RunModel(model_dir); return 0; } + ``` diff --git a/docs/advanced_user_guides/add_layout.md b/docs/develop_guides/add_layout.md similarity index 99% rename from docs/advanced_user_guides/add_layout.md rename to docs/develop_guides/add_layout.md index 11e504f93c2b1bcaefaa06c0a5f51aea0995884e..26b7a07cc5788ee6e7fa36206c2432f5fc3def1c 100644 --- a/docs/advanced_user_guides/add_layout.md +++ b/docs/develop_guides/add_layout.md @@ -1,4 +1,4 @@ -# 如何增加Layout +# 新增Layout Paddle-Lite中Place包含了Target、Layout、Precision信息,用来注册和选择模型中的具体Kernel。下面以增加Place中的layout:`ImageDefault`、`ImageFolder`、`ImageNW`为例,讲解如何增加新Layout。 diff --git a/docs/advanced_user_guides/add_new_pass.md b/docs/develop_guides/add_new_pass.md similarity index 99% rename from docs/advanced_user_guides/add_new_pass.md rename to docs/develop_guides/add_new_pass.md index 93b27cd038642c702cd213adffcc378dc852a1b3..5740b7978f18cfad5754c0f77a8208bece565893 100644 --- 
a/docs/advanced_user_guides/add_new_pass.md +++ b/docs/develop_guides/add_new_pass.md @@ -1,5 +1,4 @@ - -# 新增Pass方法 +# 新增Pass 本文从三个方面介绍了`Lite`中的`Pass`结构:**Pass是什么**、**Pass的实现与接口**、**Pass的一般注册流程**。最后以`Fc_fuse_pass`为例介绍了`fusion_pass`的作用与注册方法。 diff --git a/docs/advanced_user_guides/add_operation.md b/docs/develop_guides/add_operation.md similarity index 99% rename from docs/advanced_user_guides/add_operation.md rename to docs/develop_guides/add_operation.md index 525832f8a9d7341c3124498084e05b160358b2ad..1aa955fa6a1b260fd3a17401e658e33b2b862fd9 100644 --- a/docs/advanced_user_guides/add_operation.md +++ b/docs/develop_guides/add_operation.md @@ -1,4 +1,4 @@ -# 新增OP的方法 +# 新增OP 以下以添加argmax为例,详细说明新增op的方法。 diff --git a/docs/develop_guides/architecture-intro.md b/docs/develop_guides/architecture-intro.md new file mode 100644 index 0000000000000000000000000000000000000000..f49f0525e122de9da19bacb441dfa84ab0eef7ca --- /dev/null +++ b/docs/develop_guides/architecture-intro.md @@ -0,0 +1,245 @@ +# 架构详解 + +这篇文档会从开发者角度详细介绍开发 Paddle-Lite 需要的相关信息。 + +## 设计及思考 + +近年来,各种深度学习预估硬件层出不穷,从手机APP到车载设备,再到音箱,均需要部署深度学习预测,且有如下共性需求: + +1. 高性能 +2. 硬件支持和扩展容易 +3. 轻量级部署 + +Paddle-Lite 的架构方面便是定向参考如上需求设计实现的,具体地 + +- 高性能方面 + - 通过 MIR(Machine IR) 实现精细复杂的计算图的分析和优化 + - 执行期 Kernel 的简单设计,几乎没有额外调度开销 + - 适当的硬件层抽象,框架支持各个硬件后端中做特定的调度实现 +- 轻量级部署方面 + - 拆分分析和执行两个阶段,执行阶段轻量级实现,可以单独部署 + - 轻量级 Op 和 Kernel 设计 +- 硬件支持和扩展方面 + - 通过 MIR 支撑带硬件和执行信息的宏观分析优化 + - TypeSystem 抽象带硬件的不同计算模式的表示,实现整个计算图的强类型推导,以及执行状态机的静态分析 + +Paddle-Lite 的架构尝试从强类型推导的角度建模支持多硬件,多种计算模式(不同量化精度、不同的 data layout等)的混合计算,从而实现宏观上的各异硬件和计算模式的混合。 + +框架部分已经经过 FPGA,GPU,NPU 等异构硬件的打磨,各项能力也在完善中。 + +## 重要模块介绍 + +### OpLite + +[OpLite](https://github.com/PaddlePaddle/Paddle-Lite/blob/v2.0.0-beta1-prerel/lite/core/op_lite.h#L52) 是 Paddle-Lite 中的 Operator,用户扩展单个硬件时,最多的就是扩展 Op 和 Kernel。 + +重要方法如下: + +```c++ +class OpLite : public Registry { + public: + // Check the shape. + virtual bool CheckShape() const { return true; } + // Inference the outputs' shape.
+ virtual bool InferShape() const { return true; } + // Link the external execution environ to internal context. + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope); +}; +``` + +其中,分析期执行 + +- `AttachImpl` + +执行期执行 + +- `CheckShape` +- `InferShape` + +扩展须知: + +1. `CheckShape` 只在第一个 batch 执行,所以耗时不敏感 + +2. `InferShape` 需要在每个 batch 执行,应该严格耗时 + + 1. 可以通过添加 member variable 的方式,对其中一部分信息增加 cache,比如 + + ```c++ + class XXOp : public OpLite { + void InferShape() { + int batch_size = param().input.shape[0]; + if (!shape_cache_.empty()) { + shape_cache_[0] = batch_size; + param().output->Resize(shape_cache_); + } + } + + private: + shape_t shape_cache_; + } + ``` + + + +### OpParam + +[OpParam](https://github.com/PaddlePaddle/Paddle-Lite/blob/v2.0.0-beta1-prerel/lite/operators/op_params.h) 用于存储执行期 Kernel 需要的各项参数。 所有字段可以直接存储(比如指针或者 `int`),以避免执行中获取参数的延迟。 + +因为没有需求,OpParam 暂时没有设置基类。 + +实际例子: + +```c++ +// For Softmax op +struct SoftmaxParam { + lite::Tensor* x{}; + lite::Tensor* output{}; + int axis{-1}; +}; +``` + +OpLite 的 `AttachImpl` 方法就用于构建 `OpParam` ,复制传递给 `Kernel` 用于执行。 + +OpParam 是执行期的重要模块,需要严格保证性能,相应的扩展要求: + +1. 字段的获取必须是低延迟的,可以直接用指针,或者直接复制值 +2. 避免执行无关信息混入,包括 debug 信息 +3. 命名需要与 Paddle OpDesc 中的信息严格一致,以降低功能对齐和理解的难度 + +### Kernel + +```c++ +template +class KernelLite : public KernelBase { + public: + // Run the kernel. 
+ virtual void Run() { CHECK(false) << "Not Implemented"; } + + TargetType target() const override { return Target; } + PrecisionType precision() const override { return Precision; } + DataLayoutType layout() const override { return DataLayout; } + Place place() const override { return Place{Target, Precision, DataLayout}; } + std::string name() const override; +}; +``` + +由于是执行期的重要概念,因此 Kernel 设计得非常简单高效。 + +其中,执行期的 `Run` 是其唯一重要的接口,其中包含具体的计算逻辑。 + +模板中的参数主要用于方便多硬件编译,以及自解释: + +- Target: 执行硬件 +- Precision: 主要的计算精度 +- DataLayout:主要计算的 data layout + +这部分信息用于帮助挑选 kernel,具体的值并不严格。 + + + +Kernel 的注册需要用到 TypeSystem,不光对 Kernel 本身的特性进行描述,对其输入和输出均进行详尽的定义。 + +例如 FullyConnected 的注册 + +```c++ +REGISTER_LITE_KERNEL( + fc, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::FcCompute, def) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat), LAYOUT(kNCHW))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) + .Finalize(); +``` + +Kernel自身定义是 `kARM` 的,也就是ARM上的kernel,主要的计算精度是 `kFloat`,主要的 Data layout 是 `kNCHW`。 + +接着会对其所有的输入和输出做详细定义,比如看 `Input` 输入的定义是 `LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat), LAYOUT(kNCHW))`,也就是声明其 Target 是 `kARM`, PRECISION 是 `kFloat`,Data Layout 是 `kNCHW`。 + +这里的设计思想是类似C++中的函数重载,同一个 Kernel(的名字),在重载了其输入输出的类型之后可以是不同的kernel。 + +#### 扩展须知 + +1. 模板参数选用计算中主要的来表示 + 1. 比如,scale kernel,同时能接受 `float` 和 `int` 的输入,但其不算量化 kernel,那应该设置为 `Precision=float`,代表常规的计算精度中使用 +2. Kernel 输入输出的定义需要足够精确,是什么类型就是什么类型;框架会根据其输入输出的定义来动态构建状态机,否则会出现分析期和执行期的状态机不一致,造成未定义行为 + +### MIR + +MIR 类似于 LLVM 里的 IR,只是加上了硬件和执行期的信息参与分析优化。 + +Pass 是MIR中的模块化策略,其输入和输出都是 SSA Graph.
+ +框架会自动基于模型的Program 构建 SSA Graph,之后按 [Optimizer](https://github.com/PaddlePaddle/Paddle-Lite/blob/v2.0.0-beta1-prerel/lite/core/optimizer.h) 中定义的pass的顺序调用一系列 Pass。 + +#### Op Fusion + +MIR 中的 [PatternMatcher](https://github.com/PaddlePaddle/Paddle-Lite/blob/v2.0.0-beta1-prerel/lite/core/mir/pattern_matcher.h) 实现了简单有效的基于图的模板识别的算法,相关的 op fusion 的图操作可以基于此实现。 + +实际的例子可以参考 [fc_fuse_pass.h](https://github.com/PaddlePaddle/Paddle-Lite/blob/v2.0.0-beta1-prerel/lite/core/mir/fusion/fc_fuse_pass.h)。 + +### TypeSystem + +TypeSystem 是 Paddle-Lite 中构建复杂计算图的基础模块,核心思想是协助 SSA Graph 构建一个状态机,表示其中不同的状态。 + +这里的 Type 主要包含下面四组信息,更多的信息可以按需扩展: + +- TargetType +- Precision +- DataLayout +- device id,用于表示卡号 + + + +状态机的表示: + +```python +Tensor0(kARM, kFloat, kNCHW) --pass--> Tensor1(kOpenCL, kFloat, kNCHW) +``` + +MIR 会识别出,Tensor0 和 Tensor1 的硬件位置不同,因此触发相应的 Pass 插入对应的 cast op 来进行 type cast,比如 + +``` +Tensor0(kARM, kFloat, kNCHW) --pass-> IoCopyOp(kARM, kOpenCL) --pass-> Tensor1(kOpenCL, kFloat, kNCHW) +``` + +### KernelContext + +KernelContext 是硬件支持的核心封装,主要用于为 Kernel 提供执行期的硬件上下文。 + +KernelContext 的设计类似于 OpParam,两者均没有基类;对于 KernelContext,其假定是,不同的硬件间的接口和逻辑可能完全不同,比如 kARM 和 kCUDA,因此不设定基类,也不需要提供统一的接口来封装不同硬件行为。 + +不同硬件的 KernelContext 直接与该硬件对应的 Kernel 对接。 + +KernelContext 的行为可以被 MIR 在分析期确定和调度。 + +注意事项: + +1. 由于是执行期概念,KernelContext 也需要注意性能和轻量化 +2.
移动端部署时只会部署执行期,因此 MIR 和 KernelContext 会拆开,因此 KernelContext 相应的设置需要能够序列化到 ProgramDesc 中,以便执行期载入和执行 + +## 扩展硬件后端 + +### 扩展现有的硬件后端 + +主要是扩充 Op 和 Kernel 的工作,如果需要 fuse,则参考 MIR 章节,增加相应的fuse pass便可,具体地,可以参考 + +- [fc_op](https://github.com/PaddlePaddle/Paddle-Lite/blob/release/v2.0.0-beta1/lite/operators/fc_op.h) 实现类似的 Op +- [fc_compute](https://github.com/PaddlePaddle/Paddle-Lite/blob/release/v2.0.0-beta1/lite/kernels/arm/fc_compute.h) 实现类似的 Kernel +- [fc_fuse_pass](https://github.com/PaddlePaddle/Paddle-Lite/blob/release/v2.0.0-beta1/lite/core/mir/fusion/fc_fuse_pass.h) 实现fuse逻辑,并注册到 [optimizer](https://github.com/PaddlePaddle/Paddle-Lite/blob/release/v2.0.0-beta1/lite/core/optimizer.h) + +### 扩展全新硬件后端 + +需要额外扩充如下模块,让框架能够支撑硬件执行: + +- TypeSystem,需要扩充其中相关的 type + - 相关 [enum](https://github.com/PaddlePaddle/Paddle-Lite/blob/release/v2.0.0-beta1/lite/api/paddle_place.h#L44) +- MIR,需要扩展其中的 type cast 相关的 pass + - [TargetType cast pass](https://github.com/PaddlePaddle/Paddle-Lite/blob/release/v2.0.0-beta1/lite/core/mir/type_target_cast_pass.cc) 用于拷贝不同硬件上的tensor + - [Data layout cast pass](https://github.com/PaddlePaddle/Paddle-Lite/blob/release/v2.0.0-beta1/lite/core/mir/type_target_cast_pass.h) 用于转化不同的 data layout + - [Precision cast pass](https://github.com/PaddlePaddle/Paddle-Lite/blob/release/v2.0.0-beta1/lite/core/mir/type_precision_cast_pass.h) 用于转化不同 tensor 的量化精度 +- KernelContext,具体地可以参考 + - [ARM context](https://github.com/PaddlePaddle/Paddle-Lite/blob/release/v2.0.0-beta1/lite/core/context.h#L91) + - 需要注意的是,硬件 context 的接口只服务于该硬件的 kernel + - context 有分析期和执行期两个阶段,如果分析期没有特殊的优化,则无需考虑;否则,需要注意将分析期的信息整理并序列化到离线模型中,用于执行期直接加载。 diff --git a/docs/develop_guides/for-developer.md b/docs/develop_guides/for-developer.md new file mode 100644 index 0000000000000000000000000000000000000000..fc7bd412ee5091552c7244a621f9e298496973a4 --- /dev/null +++ b/docs/develop_guides/for-developer.md @@ -0,0 +1,14 @@ +# 开发基础须知 + +可以参考 [Paddle 
开发者文档](https://www.paddlepaddle.org.cn/documentation/docs/zh/1.5/advanced_usage/development/contribute_to_paddle/local_dev_guide.html)。 + +## 提交PR + +需要在 commit message 里加上 `test=develop` 才能触发 CI + +## 版本发布检查清单 + +1. 所有 feature 梳理,确认状态 +2. 所有 QA 测试结果梳理,确认版本可靠 +3. Release note 确认 review 通过 +4. 确认需要 release 的 binary 编译完毕 diff --git a/docs/develop_guides/index.rst b/docs/develop_guides/index.rst deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/docs/index.rst b/docs/index.rst index d7359f1d0508f8e85824f450ca07f095d047f90c..5e8cb6b2148af4a7f68faf602bdb617743e48e1b 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -13,10 +13,12 @@ Welcome to Paddle-Lite's documentation! introduction/tech_highlights introduction/architecture + introduction/support_hardware + introduction/support_operation_list .. toctree:: :maxdepth: 1 - :caption: Benchmark数据和方法 + :caption: Benchmark :name: sec-benchmark benchmark/benchmark @@ -24,46 +26,67 @@ Welcome to Paddle-Lite's documentation! .. toctree:: :maxdepth: 1 - :caption: 安装 - :name: sec-install - - installation/source_compile - -.. toctree:: - :maxdepth: 1 - :caption: 使用指南 + :caption: 使用方法 :name: sec-user-guides + user_guides/tutorial + user_guides/release_lib + user_guides/source_compile + user_guides/x2paddle user_guides/model_optimize_tool + user_guides/post_quant_with_data + user_guides/post_quant_no_data + user_guides/model_quantization + user_guides/debug user_guides/library_tailoring - user_guides/cuda - user_guides/opencl .. 
toctree:: :maxdepth: 1 - :caption: 进阶使用指南 - - advanced_user_guides/support_operation_list - advanced_user_guides/add_operation - advanced_user_guides/add_layout - advanced_user_guides/model_quantization - advanced_user_guides/add_new_pass - advanced_user_guides/x86 + :caption: 部署示例 + :name: sec-demo_guides + + demo_guides/cpp_demo + demo_guides/java_demo + demo_guides/android_app_demo + demo_guides/ios_app_demo + demo_guides/x86 + demo_guides/cuda + demo_guides/opencl + demo_guides/fpga + demo_guides/npu .. toctree:: :maxdepth: 1 - :caption: 开发者文档 + :caption: API文档 + + api_reference/cxx_api_doc + api_reference/java_api_doc + api_reference/python_api_doc + api_reference/cv + +.. toctree:: + :maxdepth: 1 + :caption: 开发者贡献 + + develop_guides/for-developer + develop_guides/architecture-intro + develop_guides/add_operation + develop_guides/add_layout + develop_guides/add_new_pass .. toctree:: :maxdepth: 1 - :caption: API文档 + :caption: Roadmap + :name: sec-roadmap - api_reference/cxx_api_doc + introduction/roadmap .. toctree:: :maxdepth: 1 :caption: FAQ + introduction/faq + .. toctree:: :maxdepth: 1 :caption: paddle-mobile diff --git a/docs/installation/library.md b/docs/installation/library.md deleted file mode 100644 index ef2f8fdb18ade439d620b348738cbb752d5bd8b6..0000000000000000000000000000000000000000 --- a/docs/installation/library.md +++ /dev/null @@ -1,61 +0,0 @@ - -# 预测库说明 - -Paddle-Lite的编译结果为预测库文件(包括静态库和动态库),具体编译过程参考[源码编译](./source_compile)。 - -Lite预测库分为**基础预测库**和**全量预测库**:基础预测库只打包了基础模型需要的基础算子,预测库体积较小;全量预测库打包了所有的Lite算子,可以支持更多的模型,但是预测库的体积也更大。 编译时由编译选项 `build_extra`(默认为OFF)控制,`--build_extra=OFF`时编译基础预测库,`--build_extra=ON`时编译全量的预测库。 - -## 基础预测库 - -### 编译方法 -编译时设置`--build_extra=OFF` (默认值) 或不指定即可编译出基础预测库。例如: - -``` -./lite/tools/build.sh --arm_os=android --arm_abi=armv8 --arm_lang=gcc --android_stl=c++_static tiny_publish -``` - -### 基础预测库支持的功能 - -(1)支持基础CV模型 - -(2)支持基础的in8量化模型 - -(3)支持[benchmark测试](../benchmark/benchmark) - - -### 基础预测库支持的基础模型: - -1. 
fluid基础模型(paddle model 提供的基础模型9个) - -``` -mobileNetV1 mnasnet yolov3 ssd_mobilenetv1 shufflenet_v2 -mobileNetV2 resnet50 unet squeezenet_v11 -``` - -2. int8量化模型模型 - -``` -mobilenet_v1 mobilenet_v2 resnet50 -``` - -### 特点 - 轻量级预测库,体积更小,支持常用的基础模型。 - - - -## 全量预测库 - -### 编译方法 -编译时设置`--build_extra=ON` 即可编译出全量预测库。例如: - -``` -./lite/tools/build.sh --arm_os=android --arm_abi=armv8 --arm_lang=gcc --android_stl=c++_static --build_extra=ON tiny_publish -``` -### 全量预测库功能 - -(1) 基础预测库所有功能 - -(2)支持所有Paddle-Lite中注册的所有算子 - -### 特点 - 支持更多的硬件平台和算子,可以支持更多模型但体量更大。 diff --git a/docs/introduction/faq.md b/docs/introduction/faq.md new file mode 100644 index 0000000000000000000000000000000000000000..768b92a31b42934d454bfa3afbee6f8dba1ef462 --- /dev/null +++ b/docs/introduction/faq.md @@ -0,0 +1,8 @@ +# FAQ 常见问题 + +问题或建议可以发Issue,为加快问题解决效率,可先检索是否有类似问题,我们也会及时解答! +欢迎加入Paddle-Lite百度官方QQ群:696965088 + +1. 在Host端采用交叉编译方式编译PaddleLite,将编译后的libpaddle_light_api_shared.so和可执行程序放到板卡上运行,出现了如下图所示的错误,怎么解决? +![host_target_compiling_env_miss_matched](https://user-images.githubusercontent.com/9973393/75761527-31b8b700-5d74-11ea-8a9a-0bc0253ee003.png) +- 原因是Host端的交叉编译环境与Target端板卡的运行环境不一致,导致libpaddle_light_api_shared.so链接的GLIBC库高于板卡环境的GLIBC库。目前有四种解决办法(为了保证编译环境与官方一致,推荐第一种方式):1)在Host端,参考[源码编译](../user_guides/source_compile)中的Docker方式重新编译libpaddle_light_api_shared.so;2)在Host端,使用与Target端版本一致的ARM GCC和GLIBC库重新编译libpaddle_light_api_shared.so;3)在Target端板卡上,参考[源码编译](../user_guides/source_compile)中的ARM Linux本地编译方式重新编译libpaddle_light_api_shared.so;4)在Target端板卡上,将GLIBC库升级到和Host端一致的版本,即GLIBC2.27。 diff --git a/docs/introduction/roadmap.md b/docs/introduction/roadmap.md new file mode 100644 index 0000000000000000000000000000000000000000..0c5b5366041ff4cf406fe5d9d67833925c7795f8 --- /dev/null +++ b/docs/introduction/roadmap.md @@ -0,0 +1,32 @@ +# Road map + +这篇文档会介绍 Paddle-Lite 近期对外的开源版本和计划。 + +其中包含的 feature 为最小集合,按最终发布的版本为准。 + + +## 2.0.0-beta1-prerelease + +预计发布 *2019-8-26 ~ 2days* + +- 完善编译和 benchmark 文档 +- 
增加第三方依赖代码的离线下载功能,加速编译过程 +- 去掉 `tiny_publish` 模式下无关的第三方代码下载,可以不依赖任何第三方 + +## 2.0.0-beta1 + +预计发布 *2019-9-1~2days* + +- `model_optimize_tool` 从 ARM 上执行修改为 Host 上执行,只从 kernel 分布来确定计算图优化;后续硬件针对优化会发布新的工具; +- Paddle 模型支持参数 composed 的格式 +- 增加分层编译来控制常用模型的部署库的大小,分两个模式 `basic`, `extra`;默认 `basic` 模式只发布核心的op 和kernel;将控制流相关的Op和kernel 折叠进 `extra` 按需编译 +- 增加 INT8 量化,从 PaddleSlim 训练到 PaddleLite 部署完整案例 +- 支持内存中加载模型,以支持 APP 的简易加密 + +## 2.3 + +[v2.3 project](https://github.com/PaddlePaddle/Paddle-Lite/milestone/3?closed=1) + +## 2.6 + +[v2.6 project](https://github.com/PaddlePaddle/Paddle-Lite/milestones/v2.6) diff --git a/docs/introduction/support_hardware.md b/docs/introduction/support_hardware.md new file mode 100644 index 0000000000000000000000000000000000000000..b1a6823d26d4fe8838afee00732707608b836599 --- /dev/null +++ b/docs/introduction/support_hardware.md @@ -0,0 +1,45 @@ + +# 支持硬件 + + +## ARM CPU +Paddle Lite支持[ARM Cortex-A系列处理器](https://en.wikipedia.org/wiki/ARM_Cortex-A),支持列表如下: +### 32bit(ARMv7a) +- Cortex-A5 +- Cortex-A7 +- Cortex-A8 +- Cortex-A9 +- Cortex-A12 +- Cortex-A15 +- Cortex-A17(RK3288) +- Cortex-A32 +### 64bit(ARMv7a, ARMv8a) +- Cortex-A35 +- Cortex-A53(树莓派3) +- Cortex-A55 +- Cortex-A57(Nvidia tx1,Nvidia tx2, 高通810等) +- Cortex-A72(麒麟95X,高通820, RK3399,树莓派4等) +- Cortex-A73(麒麟960,麒麟970,高通835, 联发科X30等) +- Cortex-A75(高通845等) +- Cortex-A76(麒麟980,麒麟990,高通855,高通730,联发科G90等) +- Cortex-A77 +- ARMv8-A compatible(Apple A系列处理器, Nvidia tegra, Qualcomm Kryo, Falkor, Samsung Mongoose) + +## 移动端GPU +Paddle Lite支持移动端GPU和Nvidia端上GPU设备,支持列表如下: +- ARM Mali G 系列 +- Qualcomm Adreno 系列 +- Nvidia tegra系列: tx1, tx2, nano, xavier + +## NPU +Paddle Lite支持NPU,支持列表如下: +- 华为达芬奇架构NPU + +## FPGA +Paddle Lite支持FPGA,支持列表如下: +- 百度Edgeboard系列:ZU9, ZU5, ZU3 + +## XPU +Paddle Lite支持XPU,支持列表如下: +- 百度昆仑818-100芯片 +- 百度昆仑818-300芯片 diff --git a/docs/advanced_user_guides/support_operation_list.md b/docs/introduction/support_operation_list.md similarity index 96% rename from
docs/advanced_user_guides/support_operation_list.md rename to docs/introduction/support_operation_list.md index 7c2ceb0ff819f7f1676308a33ec88f5eab820e57..7a60cf46e424dfe610a0541c9e364cf6e5d98531 100644 --- a/docs/advanced_user_guides/support_operation_list.md +++ b/docs/introduction/support_operation_list.md @@ -1,44 +1,31 @@ -# 支持OP列表 +# 支持OP -## Ops +## Ops (共计158个算子) +### Basic Operators (默认编译的算子) - affine_channel -- anchor_generator - arg_max -- assign -- assign_value -- attention_padding_mask -- axpy - batch_norm -- beam_search -- beam_search_decode - bilinear_interp -- box_clip - box_coder - calib -- calib_once - cast -- collect_fpn_proposals - concat -- conditional_block - conv2d - conv2d_transpose -- crop -- decode_bboxes - density_prior_box - depthwise_conv2d -- distribute_fpn_proposals - dropout - elementwise_add - elementwise_div - elementwise_max - elementwise_mul - elementwise_sub -- equal - exp - expand - fake_channel_wise_dequantize_max_abs - fake_dequantize_max_abs +- fake_quantize_abs_max - fake_quantize_dequantize_moving_average_abs_max - fake_quantize_moving_average_abs_max - fake_quantize_range_abs_max @@ -55,6 +42,72 @@ - fusion_elementwise_max_activation - fusion_elementwise_mul_activation - fusion_elementwise_sub_activation +- gelu +- grid_sampler +- hard_sigmoid +- instance_norm +- io_copy +- io_copy_once +- layout +- leaky_relu +- log +- matmul +- mean +- mul +- multiclass_nms +- nearest_interp +- pad2d +- pool2d +- prelu +- prior_box +- range +- reduce_mean +- relu +- relu6 +- relu_clipped +- reshape +- reshape2 +- rsqrt +- scale +- search_fc +- sequence_topk_avg_pooling +- shuffle_channel +- sigmoid +- slice +- softmax +- softsign +- split +- sqrt +- square +- squeeze +- squeeze2 +- stack +- subgraph +- swish +- tanh +- transpose +- transpose2 +- unsqueeze +- unsqueeze2 +- yolo_box + +### Extra Operators (打开 `--build_extra=ON`开关才会编译) + +- anchor_generator +- assign +- assign_value +- attention_padding_mask +- axpy +- beam_search +- 
beam_search_decode +- box_clip +- calib_once +- collect_fpn_proposals +- conditional_block +- crop +- decode_bboxes +- distribute_fpn_proposals +- equal - gather - generate_proposals - graph_op @@ -62,21 +115,14 @@ - greater_than - gru - gru_unit -- hard_sigmoid - im2sequence - increment -- instance_norm -- io_copy -- io_copy_once - is_empty - layer_norm -- layout - layout_once -- leaky_relu - less_equal - less_than - lod_reset -- log - logical_and - logical_not - logical_or @@ -85,37 +131,18 @@ - lookup_table_v2 - lrn - match_matrix_tensor -- matmul -- mean - merge_lod_tensor -- mul -- multiclass_nms -- nearest_interp - negative - norm -- notequal -- pad2d -- pool2d +- not_equal - power -- prelu -- prior_box -- range - read_from_array - reduce_max -- reduce_mean - reduce_prod - reduce_sum -- relu -- relu6 -- relu_clipped -- reshape -- reshape2 - roi_align -- rsqrt -- scale - search_aligned_mat_mul - search_attention_padding_mask -- search_fc - search_grnn - search_group_padding - search_seq_arithmetic @@ -130,32 +157,15 @@ - sequence_reshape - sequence_reverse - sequence_softmax -- sequence_topk_avg_pooling - shape -- shuffle_channel -- sigmoid -- slice -- softmax -- softsign -- split - split_lod_tensor -- sqrt -- square -- squeeze -- squeeze2 -- stack -- swish -- tanh - top_k -- transpose -- transpose2 - uniform_random -- unsqueeze -- unsqueeze2 - var_conv_2d - while - write_to_array -- yolo_box + + ## Kernels diff --git a/docs/user_guides/debug.md b/docs/user_guides/debug.md new file mode 100644 index 0000000000000000000000000000000000000000..93395b25fae772954f83a1128cdb7e86c9eee994 --- /dev/null +++ b/docs/user_guides/debug.md @@ -0,0 +1,89 @@ +# 调试 + +## Profiler工具 + +Basic profiler 用于 CPU 上kernel 耗时的统计。 + +### 开启方法: + +参照 [编译安装](../user_guides/source_compile) 中的**full_publish**部分进行环境配置,在 cmake 时添加 `-DLITE_WITH_PROFILE=ON` ,就可以开启相应支持。 + +### 使用示例: + +在模型执行完毕后,会自动打印类似如下 profiler 的日志 + +``` + kernel average min max count + feed/def/1/4/2 0 0 0 1 + 
conv2d/def/4/1/1 1175 1175 1175 1 + conv2d/def/4/1/1 1253 1253 1253 1 + depthwise_conv2d/def/4/1/1 519 519 519 1 + conv2d/def/4/1/1 721 721 721 1 + elementwise_add/def/4/1/1 18 18 18 1 + conv2d/def/4/1/1 2174 2174 2174 1 + depthwise_conv2d/def/4/1/1 380 380 380 1 + conv2d/def/4/1/1 773 773 773 1 + elementwise_add/def/4/1/1 2 2 2 1 + conv2d/def/4/1/1 1248 1248 1248 1 + depthwise_conv2d/def/4/1/1 492 492 492 1 + conv2d/def/4/1/1 1150 1150 1150 1 + elementwise_add/def/4/1/1 33 33 33 1 + elementwise_add/def/4/1/1 3 3 3 1 + conv2d/def/4/1/1 1254 1254 1254 1 + depthwise_conv2d/def/4/1/1 126 126 126 1 +``` + +## Debug工具 + +**Lite Model Debug Tool** 是用来检查Paddle-Lite框架与Paddle-Fluid框架运行时tensor(包括variable与weight)之间diff信息的基础工具。 + +### 编译方法: + +1. 参照 [编译安装](../user_guides/source_compile) 中的**full_publish**部分进行环境配置和编译。 +2. 在生成的`build`目录下,执行`make lite_model_debug_tool`,`lite_model_debug_tool`产出在编译目录的`lite/tools/debug`目录下。 + +### 工作流程: + +1. 运行 `/bin/bash check_model.sh --model_dir= --build_root_dir= debug_cpp_stage` 获得模型在Paddle-Lite框架下的运行拓扑信息、variables信息和weights信息。运行后拓扑信息将会存储在默认名为 `topo_file.txt` 的文件中,variables和weights信息将会存储在默认名为 `tensor_cpp.txt` 的文件中。 +2. 运行 `/bin/bash check_model.sh --model_dir= --build_root_dir= debug_py_stage`执行fluid框架预测以获取相同模型在fluid框架下的variable与weight信息(注意:我们使用fluid的python api运行fluid模型,因此您在运行此步之前应确保已正确安装fluid的python api)。然后debug tool将会自动比较Paddle-Lite框架输出的信息和Paddle-Fluid框架输出的信息来检查是否存在运行时diff。 执行Paddle-Fluid框架,输出的信息将会存储在默认名为 `tensor_py.txt` 的文件中,相应的diff信息将会存储在默认名为 `diff.txt`的文件中(默认情况下,只会输出执行拓扑序中第一个有diff的variable相关的信息)。 + +### 注意事项: + +1. 输出的结果是在**执行完一次预测后**输出的相应变量/权重的最终值,因此如果您在预测过程进行过诸如变量复用/子图融合等优化方法,则相应的输出可能会出现偏差。 +2. 默认情况下debug tools将以全1作为输入进行比对。 +3. 默认情况下,为了保证与Paddle-Fluid框架的结果可比对,debug tool将会禁用掉所有的Paddle-Lite的优化策略。 +4.
Paddle-Lite框架的执行环境由与您的编译选项有关,比如您开启了LITE_WITH_ARM编译选项,那debug tool的`debug_cpp_stage`也需要在ARM平台下运行。 + +### Diff信息输出: + +如果debug tool检测到diff信息,那么在`diff.txt`中将会输出类似以下结构信息 + +```c++ +>>>>>>>>>>>>>>>>>>DIFF VARIABLE: dropout_0.tmp_0<<<<<<<<<<<<<<<<<<< +dropout (X:pool2d_7.tmp_0) (Mask:dropout_0.tmp_1 Out:dropout_0.tmp_0) +--------------- Tensor File info --------------- +pool2d_7.tmp_0 {1,1536,1,1} 0.749892 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0150336 0.621641 0.147099 0.636727 0.0 0.0 0.00410917 0.784708 0.0 0.0704846 0.233599 0.840123 0.239201 0.112878 0.0 0.155352 0.306906 0.0 0.0 0.860938 0.221037 0.787316 0.256585 ... +dropout_0.tmp_0 {1,1536,1,1} 0.749892 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0150336 0.621641 0.147099 0.636727 0.0 0.0 0.00410917 0.784708 0.0 0.0704846 0.233599 0.840123 0.239201 0.112878 0.0 0.155352 0.306906 0.0 0.0 0.860938 0.221037 0.787316 0.256585 ... +--------------- Fluid Tensor info --------------- +pool2d_7.tmp_0 {1,1536,1,1} 0.7498912 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.015033395 0.6216395 0.14709876 0.63672537 0.0 0.0 0.0041093696 0.7847073 0.0 0.07048465 0.23359808 0.8401219 0.23919891 0.1128789 0.0 0.1553514 0.3069055 0.0 0.0 0.8609365 0.22103554 ... +dropout_0.tmp_0 {1,1536,1,1} 0.599913 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.012026716 0.4973116 0.117679015 0.5093803 0.0 0.0 0.0032874958 0.62776583 0.0 0.056387722 0.18687847 0.67209756 0.19135913 0.090303116 0.0 0.12428112 0.2455244 0.0 0.0 0.68874925 ... +``` + +其中第二行为op相关信息,标明了执行哪个op出现了diff及其对应的输入输出变量名。Tensor File info为Paddle-Lite框架的输出信息,而Fluid Tensor info为Paddle-Fluid框架的相应输出信息。 +示例中的`dropout_0.tmp_1`没有相应的tensor信息是因为工具检测到其在预测的后序流程中未被使用,因此不会对预测结果造成影响,从而将其自动屏蔽掉以保证输出尽量简洁。 + +### 其他选项: + +| Option | Description | +| --------------------------- | ------------------------------------------------------------ | +| --input_file | 输入文件名,不同field以逗号分隔,相同field内以空格分隔, 只有文件中的第一行输入信息会被使用. 
如果您不指定input_file,那么所有输入将会被置为1。注意:`debug_py_stage`目前不支持多field输入。 | +| --cpp_topo_file | 存储运行时拓扑信息,由`debug_cpp_stage`写入并且由`debug_py_stage`读取使用。 默认为`topo_file.txt` 。 | +| --cpp_tensor_file | 存储`debug_cpp_stage` 在运行拓扑序下的输出信息,默认为 `tensor_cpp.txt` 。 | +| --tensor_names | 如果此选项不为空,那么只输出由此选项中指定名字的variable/weight信息,名字间用逗号分隔。 | +| --tensor_output_length | 输出数据的长度,默认为全部输出。 | +| --py_threshold | 判断diff发生的阈值,默认为 `1e-5` 。 | +| --py_tensor_file | 存储`debug_py_stage` 在运行拓扑序下的输出信息,默认为`tensor_py.txt`. | +| --py_output_file | diff信息的存储文件,默认为`diff.txt`。 | +| --py_only_output_first_diff | 是否只输出运行时拓扑序中第一个有diff的var/op信息,默认为true | + +您可以参考 `check_model.sh` 脚本中的代码以获得更多细节. diff --git a/docs/user_guides/index.rst b/docs/user_guides/index.rst deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/docs/user_guides/library.md b/docs/user_guides/library.md new file mode 100644 index 0000000000000000000000000000000000000000..20f16322c67cc9d10d2f667fa2ca7bceb83e338b --- /dev/null +++ b/docs/user_guides/library.md @@ -0,0 +1,57 @@ + +# `build_extra`参数说明: + +Lite预测库分为**基础预测库**和**全量预测库(with_extra)**:基础预测库只包含基础CV算子(OP),体积较小;全量预测库包含所有Lite算子,体积较大,支持模型较多。 + +编译时由编译选项 `build_extra`(默认为OFF)控制,`--build_extra=OFF`时编译**基础预测库**,`--build_extra=ON`时编译**全量预测库**。 + +## 基础预测库( [基础OP列表](../advanced_user_guides/support_operation_list.html#basic-operators) ) + + +### 支持功能 + +(1)87个[基础OP](../advanced_user_guides/support_operation_list.html#basic-operators) (2)9个基础模型 (3)3个int8量化模型 + + +### 支持的模型 + +1. fluid基础模型(来源:[paddle-models](https://github.com/PaddlePaddle/models) ) + +``` +mobilenetV1 mnasnet yolov3 ssd_mobilenetv1 shufflenet_v2 +mobilenetV2 resnet50 unet squeezenet_v11 +``` + +2.
int8量化模型 + +``` +mobilenet_v1 mobilenet_v2 resnet50 +``` + +### 特点 + 轻量级预测库,体积更小,支持常用模型。 + +### 编译方法 +编译时设置`--build_extra=OFF` (默认值) 编译出基础预测库。例如: + +``` +./lite/tools/build.sh --arm_os=android --arm_abi=armv8 --arm_lang=gcc --android_stl=c++_static tiny_publish +``` + + +## 全量预测库( [OP列表](../advanced_user_guides/support_operation_list.html#op) ) + + +### 支持功能 + + Paddle-Lite中的全量算子( [基础OP](../advanced_user_guides/support_operation_list.html#basic-operators) + [Extra OP](../advanced_user_guides/support_operation_list.html#extra-operators-build-extra-on) ) + +### 特点 + 包含更多算子、支持更多模型,但体量更大。 + +### 编译方法 +设置`--build_extra=ON` 可编译出全量预测库。例如: + +``` +./lite/tools/build.sh --arm_os=android --arm_abi=armv8 --arm_lang=gcc --android_stl=c++_static --build_extra=ON tiny_publish +``` diff --git a/docs/user_guides/library_tailoring.md b/docs/user_guides/library_tailoring.md index 5ba12cf819945ab2f182f672a2c96123bc12e070..cf0641b7314f112e9cb7ac4f0a9094bdbdaa7ca6 100644 --- a/docs/user_guides/library_tailoring.md +++ b/docs/user_guides/library_tailoring.md @@ -1,5 +1,5 @@ -# 裁剪预测库方法 +# 裁剪预测库 Paddle-Lite支持**根据模型裁剪预测库**功能。Paddle-Lite的一般编译会将所有已注册的operator打包到预测库中,造成库文件体积膨胀;**裁剪预测库**能针对具体的模型,只打包优化后该模型需要的operator,有效降低预测库文件大小。 @@ -39,7 +39,7 @@ Paddle-Lite支持**根据模型裁剪预测库**功能。Paddle-Lite的一般编 例如: ```bash -./lite/tools/build.sh --arm_os=android --arm_abi=armv7 --arm_lang=gcc --android_stl=c++_static --build_extra=ON --build_tailor=ON --opt_model_dir=../mobilenet_v1NB full_publish +./lite/tools/build.sh --arm_os=android --arm_abi=armv7 --arm_lang=gcc --android_stl=c++_static --build_extra=ON --build_tailor=ON --opt_model_dir=../mobilenet_v1NB tiny_publish ``` **注意**:上面命令中的`../mobilenet_v1NB`是第1步得到的转化模型的输出路径 @@ -88,9 +88,6 @@ Paddle-Lite支持**根据模型裁剪预测库**功能。Paddle-Lite的一般编 #include #include #include "paddle_api.h" // NOLINT -#include "paddle_use_kernels.h" // NOLINT -#include "paddle_use_ops.h" // NOLINT -#include "paddle_use_passes.h" // NOLINT using namespace paddle::lite_api; // NOLINT @@ -182,4 
+179,4 @@ int main(int argc, char** argv) { 1. 模型集合**必须**均为combined参数模型或均为非combined参数模型。 2. 使用非combined参数模型时,模型拓扑文件名应为`__model__`,使用非combined参数模型时,集合中各模型的拓扑与参数名应相同,分别由`--model_filename`和`--param_filename`指定。 3. 模型集合**必须**均为INT8量化模型或均为非INT8量化模型。 -4. 需要使用Paddle-Lite 最新版本(release/v2.1.0之后)代码编译出的model_optimize_tool。 +4. 需要使用Paddle-Lite `release/v2.1.0`之后版本代码编译出的模型优化工具。 diff --git a/docs/user_guides/model_optimize_tool.md b/docs/user_guides/model_optimize_tool.md index fccc6d8b23c78474257d11399d121816f57fc422..c3d5f527048519e851cc8b9e785dc39668e971a4 100644 --- a/docs/user_guides/model_optimize_tool.md +++ b/docs/user_guides/model_optimize_tool.md @@ -1,20 +1,26 @@ -# 模型转化方法 +# 模型优化工具 opt -Lite架构在预测过程中表现出来的高性能得益于其丰富的优化组件,其中包括量化、子图融合、混合调度、Kernel优选等等策略。为了使优化过程更加方便易用,我们提供了**opt**来自动完成优化步骤,输出一个轻量的、最优的可执行模型。具体使用方法介绍如下: +Paddle-Lite 提供了多种策略来自动优化原始的训练模型,其中包括量化、子图融合、混合调度、Kernel优选等等方法。为了使优化过程更加方便易用,我们提供了**opt** 工具来自动完成优化步骤,输出一个轻量的、最优的可执行模型。 -**注意**:release/v2.2.0之前的模型转化工具名称为`model_optimize_tool`,从release/v2.3开始模型转化工具名称修改为`opt` +具体使用方法介绍如下: + +**注意**:`v2.2.0` 之前的模型转化工具名称为`model_optimize_tool`,从 `v2.3` 开始模型转化工具名称修改为 `opt` ## 准备opt 当前获得opt方法有三种: -1. 我们提供当前develop分支编译结果下载:[opt](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/opt)、[opt_mac](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/opt_mac) -release/v2.2.0之前版本的model_optimize_tool: [model_optimize_tool](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/model_optimize_tool)、[model_optimize_tool_mac](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/model_optimize_tool_mac) - -2. 可以进入Paddle-Lite Github仓库的[release界面](https://github.com/PaddlePaddle/Paddle-Lite/releases),选择release版本下载对应的转化工具`opt` +1. **推荐!** 可以进入Paddle-Lite Github仓库的[release界面](https://github.com/PaddlePaddle/Paddle-Lite/releases),选择release版本下载对应的转化工具`opt` (release/v2.2.0之前的转化工具为model_optimize_tool、release/v2.3.0之后为opt) +2. 
本文提供`release/v2.3`和`release/v2.2.0`版本的优化工具下载 + +|版本 | Linux | MacOS| +|---|---|---| +| `release/v2.3`| [opt](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/opt) | [opt_mac](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/opt_mac) | +|`release/v2.2.0` | [model_optimize_tool](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/model_optimize_tool) | [model_optimize_tool_mac](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/model_optimize_tool_mac) | -3. 可以下载Paddle-Lite源码,从源码编译出opt工具 + +3. 如果 release 列表里的工具不符合您的环境,可以下载Paddle-Lite 源码,源码编译出opt工具 ```bash git clone https://github.com/PaddlePaddle/Paddle-Lite.git cd Paddle-Lite @@ -22,11 +28,11 @@ git checkout ./lite/tools/build.sh build_optimize_tool ``` 编译结果位于`Paddle-Lite/build.opt/lite/api/opt` -**注意**:从源码编译opt前需要先[安装Paddle-Lite的开发环境](../installation/source_compile)。 +**注意**:从源码编译opt前需要先[安装Paddle-Lite的开发环境](source_compile)。 ## 使用opt -opt是x86平台上的可执行文件,需要在PC端运行:包括Linux终端和Mac终端。 +opt是 x86 平台上的可执行文件,需要在PC端运行:支持Linux终端和Mac终端。 ### 帮助信息 执行opt时不加入任何输入选项,会输出帮助信息,提示当前支持的选项: @@ -36,7 +42,10 @@ opt是x86平台上的可执行文件,需要在PC端运行:包括Linux终端 ![](https://paddlelite-data.bj.bcebos.com/doc_images/1.png) ### 功能一:转化模型为Paddle-Lite格式 -opt可以将PaddlePaddle支持的模型转化为Paddle-Lite支持的模型格式,期间执行的操作包括:将protobuf格式的模型文件转化为naive_buffer格式的模型文件,有效降低模型体积;执行“量化、子图融合、混合调度、Kernel优选”等图优化操作,提升其在Paddle-Lite上的运行速度、内存占用等性能指标。 +opt可以将PaddlePaddle的部署模型格式转化为Paddle-Lite 支持的模型格式,期间执行的操作包括: + +- 将protobuf格式的模型文件转化为naive_buffer格式的模型文件,有效降低模型体积 +- 执行“量化、子图融合、混合调度、Kernel优选”等图优化操作,提升其在Paddle-Lite上的运行速度、内存占用等效果 模型优化过程: @@ -54,7 +63,10 @@ PaddlePaddle模型有两种保存格式: **使用示例**:转化`mobilenet_v1`模型 ``` -./opt --model_dir=./mobilenet_v1 --valid_targets=arm --optimize_out_type=naive_buffer --optimize_out=mobilenet_v1_opt +./opt --model_dir=./mobilenet_v1 \ + --valid_targets=arm \ + --optimize_out_type=naive_buffer \ + --optimize_out=mobilenet_v1_opt ``` 以上命令可以将`mobilenet_v1`模型转化为arm硬件平台、naive_buffer格式的Paddle_Lite支持模型,优化后的模型文件为`mobilenet_v1_opt.nb`,转化结果如下图所示: @@ 
-71,7 +83,6 @@ PaddlePaddle模型有两种保存格式: --optimize_out_type=(protobuf|naive_buffer) \ --optimize_out= \ --valid_targets=(arm|opencl|x86|npu|xpu) \ - --prefer_int8_kernel=(true|false) \ --record_tailoring_info =(true|false) ``` @@ -83,12 +94,12 @@ PaddlePaddle模型有两种保存格式: | --optimize_out_type | 输出模型类型,目前支持两种类型:protobuf和naive_buffer,其中naive_buffer是一种更轻量级的序列化/反序列化实现。若您需要在mobile端执行模型预测,请将此选项设置为naive_buffer。默认为protobuf。 | | --optimize_out | 优化模型的输出路径。 | | --valid_targets | 指定模型可执行的backend,默认为arm。目前可支持x86、arm、opencl、npu、xpu,可以同时指定多个backend(以空格分隔),Model Optimize Tool将会自动选择最佳方式。如果需要支持华为NPU(Kirin 810/990 Soc搭载的达芬奇架构NPU),应当设置为npu, arm。 | -| --prefer_int8_kernel | 若待优化模型为int8量化模型(如量化训练得到的量化模型),则设置该选项为true以使用int8内核函数进行推理加速,默认为false。 | | --record_tailoring_info | 当使用 [根据模型裁剪库文件](./library_tailoring.html) 功能时,则设置该选项为true,以记录优化后模型含有的kernel和OP信息,默认为false。 | * 如果待优化的fluid模型是非combined形式,请设置`--model_dir`,忽略`--model_file`和`--param_file`。 * 如果待优化的fluid模型是combined形式,请设置`--model_file`和`--param_file`,忽略`--model_dir`。 -* 优化后的模型包括__model__.nb和param.nb文件。 +* 优化后的模型为以`.nb`名称结尾的单个文件。 +* 删除`prefer_int8_kernel`的输入参数,`opt`自动判别是否是量化模型,进行相应的优化操作。 ### 功能二:统计模型算子信息、判断是否支持 @@ -121,14 +132,14 @@ opt可以统计并打印出model中的算子信息、判断Paddle-Lite是否支 **背景**:如果想用Paddle-Lite运行第三方来源(tensorflow、caffe、onnx)模型,一般需要经过两次转化。即使用x2paddle工具将第三方模型转化为PaddlePaddle格式,再使用opt将PaddlePaddle模型转化为Padde-Lite可支持格式。 为了简化这一过程,我们提供一键脚本,将x2paddle转化和opt转化合并: -**一键转化脚本**:[auto_transform.sh](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/auto_transform.sh) +**一键转化脚本**:[auto_transform.sh](https://github.com/PaddlePaddle/Paddle-Lite/blob/release/v2.3/lite/tools/auto_transform.sh) -**环境要求**:使用`auto_transform.sh`脚本转化第三方模型时,需要先安装x2paddle环境,请参考[x2paddle环境安装方法](https://github.com/PaddlePaddle/X2Paddle#环境依赖) 安装x2paddle和其环境依赖项。 +**环境要求**:使用`auto_transform.sh`脚本转化第三方模型时,需要先安装x2paddle环境,请参考[x2paddle环境安装方法](https://github.com/PaddlePaddle/X2Paddle#环境依赖) 安装x2paddle和x2paddle依赖项(tensorflow、caffe等)。 **使用方法**: -(1)打印帮助帮助信息:` ./auto_transform.sh` 
+(1)打印帮助信息:` sh ./auto_transform.sh` (2)转化模型方法 @@ -138,7 +149,7 @@ USAGE: tranform model from tensorflow/caffe/onnx form into paddle-lite naive-buffer form. ---------------------------------------- example: - ./auto_transform.sh --framework=tensorflow --model=tf_model.pb --optimize_out=opt_model_result + sh ./auto_transform.sh --framework=tensorflow --model=tf_model.pb --optimize_out=opt_model_result ---------------------------------------- Arguments about x2paddle: --framework=(tensorflow|caffe|onnx); diff --git a/docs/advanced_user_guides/model_quantization.md b/docs/user_guides/model_quantization.md similarity index 78% rename from docs/advanced_user_guides/model_quantization.md rename to docs/user_guides/model_quantization.md index 7d781ba9904400c26b64aed5f5dc764ecc5b24fa..cf506cfa61e3942452ddaf1218d9d55c2fffa3fc 100644 --- a/docs/advanced_user_guides/model_quantization.md +++ b/docs/user_guides/model_quantization.md @@ -1,21 +1,38 @@ -# 模型量化 +# 模型量化-量化训练 -本文主要介绍使用Paddle-Lite加载PaddlePaddle产出的量化模型,并进行推理执行。我们以MobileNetV1模型为示例,首先介绍准备量化模型,然后介绍部署执行。 +本文主要介绍使用Paddle-Lite加载PaddlePaddle产出的量化模型,并进行推理执行。我们以MobileNetV1模型为示例,首先说明产出量化模型,然后说明预测部署。 -## 准备量化模型 +## 1 简介 -PaddlePaddle使用量化训练和训练后量化两种方法将FP32模型量化成Int8模型,下面分别介绍两种方法如何产出量化模型。 +量化训练是基于大量训练数据,对训练好的预测模型进行量化。该方法使用模拟量化的思想,在训练阶段更新权重,实现减小量化误差。 -### 量化训练 +使用条件: +* 有预训练模型 +* 有较多训练数据 + +使用步骤: +* 产出量化模型:使用PaddlePaddle调用量化训练接口,产出量化模型 +* 量化模型预测:使用PaddleLite加载量化模型进行预测推理 + +优点: +* 减小计算量、降低计算内存、减小模型大小 +* 模型精度受量化影响小 + +缺点: +* 使用条件较苛刻,使用门槛稍高 + +建议首先使用“有校准数据训练后量化”对模型进行量化,然后使用量化模型进行预测。如果该量化模型的精度达不到要求,再使用“量化训练”。 + + +## 2 产出量化模型 目前,PaddlePaddle框架的量化训练主要针对卷积层(包括二维卷积和Depthwise卷积)、和全连接层,对应算子是conv2d、depthwise_conv2d和mul,更多量化训练的原理请参考[文档](https://github.com/PaddlePaddle/models/blob/develop/PaddleSlim/docs/tutorial.md#1-quantization-aware-training%E9%87%8F%E5%8C%96%E4%BB%8B%E7%BB%8D)。Paddle-Lite支持运行PaddlePaddle框架量化训练产出的模型,可以进一步加快模型在移动端的执行速度。
温馨提示:如果您是初次接触PaddlePaddle框架,建议首先学习[新人入门](https://www.paddlepaddle.org.cn/documentation/docs/zh/1.5/beginners_guide/index_cn.html)和[使用指南](https://www.paddlepaddle.org.cn/documentation/docs/zh/1.5/user_guides/index_cn.html)。 - 您可以选择下载训练好的量化模型,或者使用PaddleSlim模型压缩工具训练得到量化模型。 -#### 下载量化模型 +### 下载量化模型 官方发布了[MobileNetV1量化模型](https://paddle-inference-dist.bj.bcebos.com/int8%2Fpretrain%2Fmobilenet_v1_quant%2Ffloat.zip),直接下载到本地。 @@ -23,9 +40,9 @@ PaddlePaddle使用量化训练和训练后量化两种方法将FP32模型量化 wget https://paddle-inference-dist.bj.bcebos.com/int8%2Fpretrain%2Fmobilenet_v1_quant%2Ffloat.zip ``` -#### 使用PaddleSlim模型压缩工具训练量化模型 +### 使用PaddleSlim模型压缩工具训练量化模型 -##### 安装PaddlePaddle +#### 安装PaddlePaddle 根据操作系统、安装方式、Python版本和CUDA版本,按照[官方说明](https://paddlepaddle.org.cn/start)安装PaddlePaddle。例如: @@ -39,7 +56,7 @@ Ubuntu 16.04.4 LTS操作系统,CPU版本安装: pip install paddlepaddle==1.6.0 -i https://mirrors.aliyun.com/pypi/simple/ ``` -##### 克隆量化训练所需的代码库 +#### 克隆量化训练所需的代码库 克隆[PaddlePaddle/models](https://github.com/PaddlePaddle/models)到本地,并进入models/PaddleSlim路径。 @@ -48,12 +65,13 @@ git clone https://github.com/PaddlePaddle/models.git cd models/PaddleSlim ``` -##### 数据准备 -###### 训练数据准备 +#### 准备数据和模型 + +##### 训练数据准备 参考[models/PaddleCV/image_classification](https://github.com/PaddlePaddle/models/tree/develop/PaddleCV/image_classification#data-preparation)中的数据准备教程,下载训练数据,并且保存到PaddleSlim/data路径下。 -###### 预训练模型准备 +##### 预训练模型准备 参考/models/PaddleSlim/run.sh脚本, 从[models/PaddleCV/image_classification](https://github.com/PaddlePaddle/models/tree/develop/fluid/PaddleCV/image_classification#supported-models-and-performances)下载MobileNetV1的预训练模型,并保存到PaddleSlim/pretrain路径下。 @@ -84,8 +102,7 @@ cd models/PaddleSlim 在`compress.py`中定义了执行压缩任务需要的所有模型相关的信息,这里对几个关键的步骤进行简要介绍: -###### 目标网络的定义 - +**目标网络的定义** compress.py的以下代码片段定义了train program, 这里train program只有前向计算操作。 ```python out = model.net(input=image, class_dim=args.class_dim) @@ -103,7 +120,7 @@ val_program = fluid.default_main_program().clone() 定义完目标网络结构,需要对其初始化,并根据需要加载预训练模型。 
-###### 定义feed_list和fetch_list +**定义feed_list和fetch_list** 对于train program, 定义train_feed_list用于指定从train data reader中取的数据feed给哪些variable。定义train_fetch_list用于指定在训练时,需要在log中展示的结果。如果需要在训练过程中在log中打印accuracy信心,则将('acc_top1', acc_top1.name)添加到train_fetch_list中即可。 ```python train_feed_list = [('image', image.name), ('label', label.name)] @@ -119,7 +136,7 @@ val_feed_list = [('image', image.name), ('label', label.name)] val_fetch_list = [('acc_top1', acc_top1.name), ('acc_top5', acc_top5.name)] ``` -###### Compressor和量化配置文件 +**Compressor和量化配置文件** `compress.py`主要使用Compressor和yaml文件完成对模型的量化训练工作。Compressor类的定义如下: ```python class Compressor(object): @@ -192,7 +209,7 @@ compressor: > > 3)**目前,Paddle-Lite仅支持运行weight量化方式使用`abs_max`且activation量化方式使用`moving_average_abs_max`或`range_abs_max`产出的量化模型**。 -##### 执行int8量化训练 +#### 执行量化训练 修改run.sh,即注释掉`# enable GC strategy`与`# for sensitivity filter pruning`之间的内容并打开`#for quantization`相关的脚本命令(所需打开注释的命令如下所示)。 @@ -214,56 +231,13 @@ python compress.py \ * int8目录: 参数范围为int8范围且参数数据类型为int8的量化模型。 * mobile目录:参数特点与int8目录相同且兼容paddle-mobile的量化模型(目前paddle-mobile已升级为Paddle-Lite)。 -### 训练后量化 - -下面以MobileNetV1为例,介绍使用训练后量化方法产出量化模型。关于训练后量化的原理和详细使用方法,请参考[文档](https://github.com/PaddlePaddle/models/tree/develop/PaddleSlim/quant_low_level_api)。 - -> 该示例的代码放在[models/PaddleSlim/quant_low_level_api/](https://github.com/PaddlePaddle/models/tree/develop/PaddleSlim/quant_low_level_api)目录下。如果需要执行该示例,首先clone下来[models](https://github.com/PaddlePaddle/models.git),安装具有训练后量化功能的PaddlePaddle。因为目前Lite支持支持对conv2d、depthwise_conv2d和mul量化,所以修改[run_post_training_quanzation.sh](https://github.com/PaddlePaddle/models/blob/develop/PaddleSlim/quant_low_level_api/run_post_training_quanzation.sh) 脚本,设置is_full_quantize=False,然后执行该脚本;执行结束后,量化模型保存在`mobilenetv1_int8_model`目录下。下面介绍详细步骤。 - -1)**准备模型和校准数据** - -安装PaddlePaddle的develop分支编译的whl包,准备已经训练好的FP32预测模型。 - -准备校准数据,文件结构如下。val文件夹中有100张图片,val_list.txt文件中包含图片的label。 -```bash -samples_100 -└──val -└──val_list.txt -``` - -2)**配置校准数据生成器** - 
-MobileNetV1的输入是图片和标签,所以配置读取校准数据的sample_generator,每次返回一张图片和一个标签。详细代码在[models/PaddleSlim/reader.py](https://github.com/PaddlePaddle/models/blob/develop/PaddleSlim/reader.py)。 - -3)**调用训练后量化** - -调用训练后量化的核心代码如下,详细代码在[post_training_quantization.py](https://github.com/PaddlePaddle/models/blob/develop/PaddleSlim/quant_low_level_api/post_training_quantization.py)。 -``` python -place = fluid.CUDAPlace(0) if args.use_gpu == "True" else fluid.CPUPlace() -exe = fluid.Executor(place) -sample_generator = reader.val(data_dir=args.data_path) - -ptq = PostTrainingQuantization( - executor=exe, - sample_generator=sample_generator, - model_dir=args.model_dir, - model_filename=args.model_filename, - params_filename=args.params_filename, - batch_size=args.batch_size, - batch_nums=args.batch_nums, - algo=args.algo, - is_full_quantize=args.is_full_quantize == "True") -quantized_program = ptq.quantize() -ptq.save_quantized_model(args.save_model_path) -``` - -## 使用Paddle-Lite运行量化模型推理 +## 3 使用Paddle-Lite运行量化模型推理 -#### 使用模型优化工具对量化模型进行优化 +### 使用模型优化工具对量化模型进行优化 接下来,使用原始的量化模型生成适合在移动端直接部署的模型。 -参考[源码编译](../source_compile)配置编译环境,确保可以编译成功。参考[模型转化方法](../model_optimize_tool),首先编译model_optimize_tool工具,然后执行下面命令对量化训练的模型进行优化(注意,需要自行修改model_file、param_file和optimize_out)。 +参考[源码编译](source_compile)配置编译环境,确保可以编译成功。参考[模型转化方法](model_optimize_tool),首先编译model_optimize_tool工具,然后执行下面命令对量化训练的模型进行优化(注意,需要自行修改model_file、param_file和optimize_out)。 ```bash ./model_optimize_tool \ --model_file=mobilenet_v1_quant/float/model \ @@ -271,12 +245,11 @@ ptq.save_quantized_model(args.save_model_path) --optimize_out_type=naive_buffer \ --optimize_out=mobilenet_v1_quant_opt \ --valid_targets=arm \ ---prefer_int8_kernel=true ``` 如前所述,量化训练后,float目录下的模型参数范围为int8,但参数数据类型仍为float32类型,这样确实没有起到模型参数压缩的效果。但是,经过model\_optimize\_tool工具优化后对应的量化参数均会以int8类型重新存储达到参数压缩的效果,且模型结构也被优化(如进行了各种operator fuse操作)。 -#### 在手机端准备量化模型文件 +### 在手机端准备量化模型文件 使用如下命令将mobilenet_v1_quant_opt目录下的量化模型文件导入到手机端: @@ -284,9 +257,9 @@ 
ptq.save_quantized_model(args.save_model_path) adb push mobilenet_v1_quant_opt /data/local/tmp ``` -#### 使用mobilenetv1\_light\_api运行优化后的量化模型 +### 使用mobilenetv1\_light\_api运行优化后的量化模型 -参考[源码编译](../source_compile)配置编译环境后,在Paddle-Lite执行如下命令获取轻量级API的demo: +参考[源码编译](source_compile)配置编译环境后,在Paddle-Lite执行如下命令获取轻量级API的demo: ```bash cd /Paddle-Lite/build.lite.android.armv8.gcc/inference_lite_lib.android.armv8/demo/cxx/mobile_light @@ -314,9 +287,9 @@ Output[700]: 0.002509 Output[800]: 0.000538 Output[900]: 0.000969 ``` -在C++中使用Paddle-Lite API的方法请猛戳[此处](../cpp_demo),用户也可参考[mobilenetv1_light_api.cc](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc)的代码示例。 +在C++中使用Paddle-Lite API的方法请猛戳[此处](../demo_guides/cpp_demo),用户也可参考[mobilenetv1_light_api.cc](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc)的代码示例。 -### FAQ +## FAQ **问题**:Compiled with WITH_GPU, but no GPU found in runtime diff --git a/docs/user_guides/paddle_mobile.md b/docs/user_guides/paddle_mobile.md new file mode 100644 index 0000000000000000000000000000000000000000..43d17db7be4935b11ff0101e06e1f06998e9f532 --- /dev/null +++ b/docs/user_guides/paddle_mobile.md @@ -0,0 +1,7 @@ +# paddle-mobile 编译 + +详情可以参考 [mobile/README](https://github.com/PaddlePaddle/Paddle-Lite/tree/develop/mobile) + +要切换 paddle-mobile 编译,cmake 需要加上 **-DWITH_PADDLE_MOBILE=ON** 开关,其余 flag 请参考上面文档添加到后面 + +所有其他选项跟 paddle-mobile 原始操作完全一致 diff --git a/docs/user_guides/post_quant_no_data.md b/docs/user_guides/post_quant_no_data.md new file mode 100644 index 0000000000000000000000000000000000000000..4068249ff7544f42c5f2643c971eb003836b1f59 --- /dev/null +++ b/docs/user_guides/post_quant_no_data.md @@ -0,0 +1,99 @@ +# 模型量化-无校准数据训练后量化 + +本文首先简单介绍无校准数据训练后量化,然后说明产出量化模型,最后阐述量化模型预测。 + +## 1 简介 + +无校准数据训练后量化,将模型中特定OP的权重从FP32类型量化成INT8/16类型,可以减小预测模型的大小。使用该量化模型预测,首先将INT8/16类型的权重反量化成FP32类型,然后再进行预测。 + +使用条件: +* 有训练好的预测模型 + +使用步骤: +* 
产出量化模型:使用PaddlePaddle调用无校准数据训练后量化接口,产出量化模型 +* 量化模型预测:使用PaddleLite加载量化模型进行预测推理 + +优点: +* 权重量化成INT16类型,模型精度不受影响,模型大小为原始的1/2 +* 权重量化成INT8类型,模型精度会受到影响,模型大小为原始的1/4 + +缺点: +* 暂无 + +## 2 产出量化模型 + +大家可以使用PaddlePaddle调用无校准数据训练后量化接口,得到量化模型。 + +### 2.1 安装PaddlePaddle + +参考PaddlePaddle[官网](https://www.paddlepaddle.org.cn/install/quick),安装PaddlePaddle CPU/GPU 1.7版本。 + +### 2.2 准备模型 + +准备已经训练好的FP32预测模型,即 `save_inference_model()` 保存的模型。 + +### 2.3 调用无校准数据训练后量化 + +对于调用无校准数据训练后量化,首先给出一个例子。 + +```python +from paddle.fluid.contrib.slim.quantization import WeightQuantization + +model_dir = path/to/fp32_model_params +save_model_dir = path/to/save_model_path +weight_quant = WeightQuantization(model_dir=model_dir) +weight_quant.quantize_weight_to_int(save_model_dir=save_model_dir, + weight_bits=16, + quantizable_op_type=['conv2d', 'depthwise_conv2d', 'mul']) +``` + +对于调用无校准数据训练后量化,以下对api接口进行详细介绍。 + +```python +class WeightQuantization(model_dir, model_filename=None, params_filename=None) +``` +参数说明如下: +* model_dir(str):待量化模型的路径,其中保存模型文件和权重文件。 +* model_filename(str, optional):待量化模型的模型文件名,如果模型文件名不是`__model__`,则需要使用model_filename设置模型文件名。 +* params_filename(str, optional):待量化模型的权重文件名,如果所有权重保存成一个文件,则需要使用params_filename设置权重文件名。 + +```python +WeightQuantization.quantize_weight_to_int(save_model_dir, + save_model_filename=None, + save_params_filename=None, + quantizable_op_type=['conv2d', 'mul'], + weight_bits=8, + threshold_rate=0.0) +``` +参数说明如下: +* save_model_dir(str):保存量化模型的路径。 +* save_model_filename(str, optional):如果save_model_filename等于None,则模型的网络结构保存到__model__文件,如果save_model_filename不等于None,则模型的网络结构保存到特定的文件。默认为None。 +* save_params_filename(str, optional):如果save_params_filename等于None,则模型的参数分别保存到一系列文件中,如果save_params_filename不等于None,则模型的参数会保存到一个文件中,文件名为设置的save_params_filename。默认为None。 +* quantizable_op_type(list[str]): 需要量化的op类型,默认是`['conv2d', 'mul']`,列表中的值可以是任意支持量化的op类型 `['conv2d', 'depthwise_conv2d', 'mul']`。 +* weight_bits(int, optional):权重量化保存的比特数,可以是8~16,一般设置为8/16。默认为8。 + + +## 3 量化模型预测 
+ +目前,对于无校准数据训练后量化产出的量化模型,不支持PaddlePaddle加载执行,只能使用PaddleLite进行预测部署。 + +很简单,首先使用PaddleLite提供的模型转换工具(opt)将量化模型转换成移动端预测的模型,然后加载转换后的模型进行预测部署。 + +注意,PaddleLite 2.3版本才支持无校准数据训练后量化产出的量化,所以转换工具和预测库必须是2.3及之后的版本。 + +### 3.1 模型转换 + +参考[模型转换](../user_guides/model_optimize_tool)准备模型转换工具,建议从Release页面下载。 + +参考[模型转换](../user_guides/model_optimize_tool)使用模型转换工具。 +比如在安卓手机ARM端进行预测,模型转换的命令为: +```bash +./opt --model_dir=./mobilenet_v1_quant \ + --optimize_out_type=naive_buffer \ + --optimize_out=mobilenet_v1_quant_opt \ + --valid_targets=arm +``` + +### 3.2 量化模型预测 + +和FP32模型一样,转换后的量化模型可以在Android/IOS APP中加载预测,建议参考[C++ Demo](../demo_guides/cpp_demo)、[Java Demo](../demo_guides/java_demo)、[Android/IOS Demo](../demo_guides/android_app_demo)。 diff --git a/docs/user_guides/post_quant_with_data.md b/docs/user_guides/post_quant_with_data.md new file mode 100644 index 0000000000000000000000000000000000000000..0044b47610a2a211859bdc42f83f1921a681d50b --- /dev/null +++ b/docs/user_guides/post_quant_with_data.md @@ -0,0 +1,202 @@ +# 模型量化-有校准数据训练后量化 + +本文首先简单介绍有校准数据训练后量化,然后说明产出量化模型、量化模型预测,最后给出一个使用示例。 +如果想快速上手,大家可以先参考使用示例,再查看详细使用方法。 + +## 1 简介 + +有校准数据训练后量化,使用少量校准数据计算量化因子,可以快速得到量化模型。使用该量化模型进行预测,可以减少计算量、降低计算内存、减小模型大小。 + +有校准数据训练后量化中,有两种计算量化因子的方法,非饱和量化方法和饱和量化方法。非饱和量化方法计算整个Tensor的绝对值最大值`abs_max`,将其映射为127。饱和量化方法使用KL散度计算一个合适的阈值`T` (`0 | --arm_abi |必选,选择编译的arm版本,其中`armv7hf`为ARMLinux编译时选用| `armv8`、`armv7`、`armv7hf`(仅`armlinux`支持) | | --arm_lang |arm_os=android时必选,选择编译器 | `gcc`、`clang`(`clang`当前暂不支持) | | --android_stl |arm_os=android时必选,选择静态链接STL或动态链接STL | `c++_static`、`c++_shared`| -| --build_java | 可选,是否编译java预测库(默认为OFF) | `ON`、`OFF` | +| --build_java | 可选,是否编译java预测库(默认为ON) | `ON`、`OFF` | | --build_extra | 可选,是否编译全量预测库(默认为OFF)。详情可参考[预测库说明](./library.html)。 | `ON`、`OFF` | | target |必选,选择编译模式,`tiny_publish`为编译移动端部署库、`full_publish`为带依赖的移动端部署库、`test`为移动端单元测试、`ios`为编译ios端`tiny_publish` | `tiny_publish`、`full_publish`、`test`、 `ios` | @@ -278,7 +282,6 @@ git checkout --build_extra=OFF \ --arm_lang=gcc \ 
--android_stl=c++_static \ - --build_extra=OFF \ tiny_publish ``` ##### IOS @@ -306,7 +309,6 @@ sudo xcode-select -s /Applications/Xcode.app/Contents/Developer --arm_os=armlinux \ --arm_abi=armv7hf \ --arm_lang=gcc \ - --build_extra=OFF \ tiny_publish ``` - `--arm_abi`: 树莓派3b使用armv7hf,RK3399使用armv8 @@ -321,7 +323,6 @@ sudo xcode-select -s /Applications/Xcode.app/Contents/Developer --build_extra=OFF \ --arm_lang=gcc \ --android_stl=c++_static \ - --build_extra=OFF \ full_publish ``` ##### ARMLinux diff --git a/docs/user_guides/tutorial.md b/docs/user_guides/tutorial.md new file mode 100644 index 0000000000000000000000000000000000000000..8f8aeb6af124bc4805c281e22e39cca51b507651 --- /dev/null +++ b/docs/user_guides/tutorial.md @@ -0,0 +1,54 @@ +# 使用流程 + +Lite是一种轻量级、灵活性强、易于扩展的高性能的深度学习预测框架,它可以支持诸如ARM、OpenCL、NPU等等多种终端,同时拥有强大的图优化及预测加速能力。如果您希望将Lite框架集成到自己的项目中,那么只需要如下几步简单操作即可。 + +## 一. 准备模型 + +Lite框架目前支持的模型结构为[PaddlePaddle](https://github.com/PaddlePaddle/Paddle)深度学习框架产出的模型格式。因此,在您开始使用 Lite 框架前您需要准备一个由PaddlePaddle框架保存的模型。 +如果您手中的模型是由诸如Caffe2、Tensorflow等框架产出的,那么我们推荐您使用 [X2Paddle](https://github.com/PaddlePaddle/X2Paddle) 工具进行模型格式转换。 + +## 二. 模型优化 + +Lite框架拥有强大的加速、优化策略及实现,其中包含诸如量化、子图融合、Kernel优选等等优化手段,为了方便您使用这些优化策略,我们提供了[opt](model_optimize_tool)帮助您轻松进行模型优化。优化后的模型更轻量级,耗费资源更少,并且执行速度也更快。 + +opt的详细介绍,请您参考 [模型优化方法](model_optimize_tool) 。 + +使用opt,您只需编译后在开发机上执行以下代码: + +``` shell +$ cd +$ cd build.opt/lite/api/ +$ ./opt \ + --model_dir= \ + --model_file= \ + --param_file= \ + --optimize_out_type=(protobuf|naive_buffer) \ + --optimize_out= \ + --valid_targets=(arm|opencl|x86) +``` + +其中,optimize_out为您希望的优化模型的输出路径。optimize_out_type则可以指定输出模型的序列化方式,其目前支持Protobuf与Naive Buffer两种方式,其中Naive Buffer是一种更轻量级的序列化/反序列化实现。如果你需要使用Lite在mobile端进行预测,那么您需要设置optimize_out_type=naive_buffer。 + +## 三. 使用Lite框架执行预测 + +在上一节中,我们已经通过`opt`获取到了优化后的模型,使用优化模型进行预测也十分的简单。为了方便您的使用,Lite进行了良好的API设计,隐藏了大量您不需要投入时间研究的细节。您只需要简单的五步即可使用Lite在移动端完成预测(以C++ API进行说明): + + +1. 
声明MobileConfig。在config中可以设置**从文件加载模型**也可以设置**从memory加载模型**。从文件加载模型需要声明模型文件路径,如 `config.set_model_from_file(FLAGS_model_file)` ;从memory加载模型方法现只支持加载优化后模型的naive buffer,实现方法为: +`void set_model_from_buffer(model_buffer) ` + +2. 创建Predictor。Predictor即为Lite框架的预测引擎,为了方便您的使用我们提供了 `CreatePaddlePredictor` 接口,你只需要简单的执行一行代码即可完成预测引擎的初始化,`std::shared_ptr predictor = CreatePaddlePredictor(config)` 。 +3. 准备输入。执行predictor->GetInput(0)您将会获得输入的第0个field,同样的,如果您的模型有多个输入,那您可以执行 `predictor->GetInput(i)` 来获取相应的输入变量。得到输入变量后您可以使用Resize方法指定其具体大小,并填入输入值。 +4. 执行预测。您只需要执行 `predictor->Run()` 即可使用Lite框架完成预测。 +5. 获取输出。与输入类似,您可以使用 `predictor->GetOutput(i)` 来获得输出的第i个变量。您可以通过其shape()方法获取输出变量的维度,通过 `data()` 模板方法获取其输出值。 + + + + +## 四. Lite API + +为了方便您的使用,我们提供了C++、Java、Python三种API,并且提供了相应的api的完整使用示例:[C++完整示例](../demo_guides/cpp_demo)、[Java完整示例](../demo_guides/java_demo)、[Python完整示例](../demo_guides/cuda),您可以参考示例中的说明快速了解C++/Java/Python的API使用方法,并集成到您自己的项目中去。需要说明的是,为了减少第三方库的依赖、提高Lite预测框架的通用性,在移动端使用Lite API您需要准备Naive Buffer存储格式的模型,具体方法可参考第2节`模型优化`。 + +## 五. 
测试工具 + +为了使您更好的了解并使用Lite框架,我们向有进一步使用需求的用户开放了 [Debug工具](debug#debug) 和 [Profile工具](debug#profiler)。Lite Model Debug Tool可以用来查找Lite框架与PaddlePaddle框架在执行预测时模型中的对应变量值是否有差异,进一步快速定位问题Op,方便复现与排查问题。Profile Monitor Tool可以帮助您了解每个Op的执行时间消耗,其会自动统计Op执行的次数,最长、最短、平均执行时间等等信息,为性能调优做一个基础参考。您可以通过 [相关专题](debug) 了解更多内容。 diff --git a/docs/user_guides/x2paddle.md b/docs/user_guides/x2paddle.md new file mode 100644 index 0000000000000000000000000000000000000000..7e44ba980cc6836189d3f1a03bbbf29c8d7bd5c1 --- /dev/null +++ b/docs/user_guides/x2paddle.md @@ -0,0 +1,69 @@ +# 模型转换工具 X2Paddle + +X2Paddle可以将caffe、tensorflow、onnx模型转换成Paddle支持的模型。 + +[X2Paddle](https://github.com/PaddlePaddle/X2Paddle)支持将Caffe/TensorFlow模型转换为PaddlePaddle模型。目前X2Paddle支持的模型参考[x2paddle_model_zoo](https://github.com/PaddlePaddle/X2Paddle/blob/develop/x2paddle_model_zoo.md)。 + + +## 多框架支持 + +|模型 | caffe | tensorflow | onnx | +|---|---|---|---| +|mobilenetv1 | Y | Y | | +|mobilenetv2 | Y | Y | Y | +|resnet18 | Y | Y | | +|resnet50 | Y | Y | Y | +|mnasnet | Y | Y | | +|efficientnet | Y | Y | Y | +|squeezenetv1.1 | Y | Y | Y | +|shufflenet | Y | Y | | +|mobilenet_ssd | Y | Y | | +|mobilenet_yolov3 | | Y | | +|inceptionv4 | | | | +|mtcnn | Y | Y | | +|facedetection | Y | | | +|unet | Y | Y | | +|ocr_attention | | | | +|vgg16 | | | | + + +## 安装 + +``` +pip install x2paddle +``` + +安装最新版本,可使用如下安装方式 + +``` +pip install git+https://github.com/PaddlePaddle/X2Paddle.git@develop +``` + +## 使用 + +### Caffe + +``` +x2paddle --framework caffe \ + --prototxt model.proto \ + --weight model.caffemodel \ + --save_dir paddle_model +``` + +### TensorFlow + +``` +x2paddle --framework tensorflow \ + --model model.pb \ + --save_dir paddle_model +``` + +## 转换结果说明 + +在指定的`save_dir`下生成两个目录 +1. inference_model : 模型结构和参数均序列化保存的模型格式 +2. 
model_with_code : 保存了模型参数文件和模型的python代码 + +## 问题反馈 + +X2Paddle使用时存在问题时,欢迎您将问题或Bug报告以[Github Issues](https://github.com/PaddlePaddle/X2Paddle/issues)的形式提交给我们,我们会实时跟进。 diff --git a/lite/CMakeLists.txt b/lite/CMakeLists.txt index bac6f80c4721e0c5de201eebfe7e6a39a0bdc73a..a39c0a02681f16578ae81c74d83979fe0c57e6c6 100644 --- a/lite/CMakeLists.txt +++ b/lite/CMakeLists.txt @@ -12,6 +12,7 @@ message(STATUS "LITE_WITH_FPGA:\t${LITE_WITH_FPGA}") message(STATUS "LITE_WITH_BM:\t${LITE_WITH_BM}") message(STATUS "LITE_WITH_PROFILE:\t${LITE_WITH_PROFILE}") message(STATUS "LITE_WITH_CV:\t${LITE_WITH_CV}") +message(STATUS "LITE_WITH_ARM_LANG:\t${LITE_WITH_ARM_LANG}") set(LITE_MODEL_DIR "${THIRD_PARTY_PATH}/install") set(LITE_ON_MOBILE ${LITE_WITH_LIGHT_WEIGHT_FRAMEWORK}) @@ -64,6 +65,9 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) if (LITE_WITH_NPU) set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.npu") endif(LITE_WITH_NPU) + if (LITE_WITH_XPU) + set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.xpu") + endif(LITE_WITH_XPU) if (LITE_WITH_FPGA) set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.fpga") endif(LITE_WITH_FPGA) @@ -79,7 +83,16 @@ message(STATUS "publish inference lib to ${INFER_LITE_PUBLISH_ROOT}") if (LITE_WITH_PYTHON) add_custom_target(publish_inference_python_lib ${TARGET} COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/lib" - COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/pybind/liblite_pybind.so" "${INFER_LITE_PUBLISH_ROOT}/python/lib/lite_core.so") + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/install/libs" + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/install/lite" + COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/setup.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/python/__init__.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite" + COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/pybind/liblite_pybind.so" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite/lite.so" + 
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/pybind/liblite_pybind.so" "${INFER_LITE_PUBLISH_ROOT}/python/lib/lite.so") + add_custom_target(publish_inference_python_installer ${TARGET} + COMMAND python setup.py bdist_wheel + WORKING_DIRECTORY ${INFER_LITE_PUBLISH_ROOT}/python/install/ + DEPENDS publish_inference_python_lib) add_custom_target(publish_inference_python_light_demo ${TARGET} COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/demo/python" COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/python/mobilenetv1_light_api.py" "${INFER_LITE_PUBLISH_ROOT}/demo/python/") @@ -91,6 +104,7 @@ if (LITE_WITH_PYTHON) endif() add_dependencies(publish_inference_python_lib lite_pybind) add_dependencies(publish_inference publish_inference_python_lib) + add_dependencies(publish_inference publish_inference_python_installer) add_dependencies(publish_inference publish_inference_python_light_demo) endif() @@ -123,7 +137,29 @@ if (LITE_WITH_X86) endif() if(LITE_WITH_CUDA) - add_dependencies(publish_inference paddle_full_api_shared) + add_custom_target(publish_inference_cuda_cxx_lib ${TARGET} + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/bin" + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_full_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" + COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_light_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" + COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/*.so" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" + ) + add_dependencies(publish_inference_cuda_cxx_lib bundle_full_api) + add_dependencies(publish_inference_cuda_cxx_lib bundle_light_api) + add_dependencies(publish_inference_cuda_cxx_lib paddle_full_api_shared) + add_dependencies(publish_inference_cuda_cxx_lib paddle_light_api_shared) + add_dependencies(publish_inference publish_inference_cuda_cxx_lib) + + 
add_custom_target(publish_inference_cuda_cxx_demos ${TARGET} + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/third_party" + COMMAND cp -r "${CMAKE_BINARY_DIR}/third_party/install/*" "${INFER_LITE_PUBLISH_ROOT}/third_party" + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" + COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/cuda_demo/*" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" + ) + add_dependencies(publish_inference_cuda_cxx_lib publish_inference_cuda_cxx_demos) + add_dependencies(publish_inference_cuda_cxx_demos paddle_full_api_shared) endif(LITE_WITH_CUDA) if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) if (NOT LITE_ON_TINY_PUBLISH) @@ -135,22 +171,23 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_full_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_light_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" - #COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/model_optimize_tool" "${INFER_LITE_PUBLISH_ROOT}/bin" COMMAND cp "${CMAKE_BINARY_DIR}/lite/gen_code/paddle_code_generator" "${INFER_LITE_PUBLISH_ROOT}/bin" COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/test_model_bin" "${INFER_LITE_PUBLISH_ROOT}/bin" COMMAND cp "${CMAKE_SOURCE_DIR}/lite/utils/cv/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" ) if(NOT IOS) - #add_dependencies(publish_inference_cxx_lib model_optimize_tool) add_dependencies(publish_inference_cxx_lib paddle_code_generator) add_dependencies(publish_inference_cxx_lib bundle_full_api) add_dependencies(publish_inference_cxx_lib bundle_light_api) add_dependencies(publish_inference_cxx_lib test_model_bin) + add_dependencies(publish_inference_cxx_lib benchmark_bin) if (ARM_TARGET_OS STREQUAL "android" OR ARM_TARGET_OS STREQUAL "armlinux") add_dependencies(publish_inference_cxx_lib paddle_full_api_shared) add_dependencies(publish_inference 
paddle_light_api_shared) add_custom_command(TARGET publish_inference_cxx_lib - COMMAND cp ${CMAKE_BINARY_DIR}/lite/api/*.so ${INFER_LITE_PUBLISH_ROOT}/cxx/lib) + COMMAND cp ${CMAKE_BINARY_DIR}/lite/api/*.so ${INFER_LITE_PUBLISH_ROOT}/cxx/lib + COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/benchmark_bin" "${INFER_LITE_PUBLISH_ROOT}/bin" + ) endif() add_dependencies(publish_inference publish_inference_cxx_lib) if(NOT "${CMAKE_BUILD_TYPE}" STREQUAL "Debug") @@ -185,6 +222,7 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) add_dependencies(publish_inference tiny_publish_cxx_lib) if(NOT "${CMAKE_BUILD_TYPE}" STREQUAL "Debug") add_custom_command(TARGET tiny_publish_cxx_lib POST_BUILD + COMMAND ${CMAKE_STRIP} "-s" ${INFER_LITE_PUBLISH_ROOT}/cxx/lib/libpaddle_api_light_bundled.a COMMAND ${CMAKE_STRIP} "-s" ${INFER_LITE_PUBLISH_ROOT}/cxx/lib/libpaddle_light_api_shared.so) endif() endif() @@ -281,6 +319,10 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/opencl" COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/backends/opencl/cl_kernel" "${INFER_LITE_PUBLISH_ROOT}/opencl" ) + if (NOT LITE_ON_TINY_PUBLISH) add_dependencies(publish_inference_cxx_lib publish_inference_opencl) + else() + add_dependencies(tiny_publish_cxx_lib publish_inference_opencl) + endif() endif() endif() diff --git a/lite/api/CMakeLists.txt b/lite/api/CMakeLists.txt index f7f74ab5822a1305e3e8d24cf36a0a458a6494ff..b360b476e0c99a62ea39a70241b548bddf5a872a 100644 --- a/lite/api/CMakeLists.txt +++ b/lite/api/CMakeLists.txt @@ -45,7 +45,11 @@ else() if ((ARM_TARGET_OS STREQUAL "android") OR (ARM_TARGET_OS STREQUAL "armlinux")) add_library(paddle_light_api_shared SHARED "") target_sources(paddle_light_api_shared PUBLIC ${__lite_cc_files} paddle_api.cc light_api.cc light_api_impl.cc) - set_target_properties(paddle_light_api_shared PROPERTIES COMPILE_FLAGS "-flto -fdata-sections") + set(TARGET_COMIPILE_FLAGS "-fdata-sections") + if (NOT (ARM_TARGET_LANG STREQUAL 
"clang")) #gcc + set(TARGET_COMIPILE_FLAGS "${TARGET_COMIPILE_FLAGS} -flto") + endif() + set_target_properties(paddle_light_api_shared PROPERTIES COMPILE_FLAGS "${TARGET_COMIPILE_FLAGS}") add_dependencies(paddle_light_api_shared op_list_h kernel_list_h) if (LITE_WITH_NPU) # Need to add HIAI runtime libs (libhiai.so) dependency @@ -78,6 +82,7 @@ message(STATUS "get X86 kernels ${x86_kernels}") message(STATUS "get CUDA kernels ${cuda_kernels}") message(STATUS "get Host kernels ${host_kernels}") message(STATUS "get ARM kernels ${arm_kernels}") +message(STATUS "get OpenCL kernels ${opencl_kernels}") message(STATUS "get NPU kernels ${npu_kernels}") message(STATUS "get XPU kernels ${xpu_kernels}") message(STATUS "get FPGA kernels ${fpga_kernels}") @@ -143,38 +148,40 @@ if(WITH_TESTING) --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL) add_dependencies(test_cxx_api extern_lite_download_lite_naive_model_tar_gz) if(NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) - lite_cc_test(test_googlenet SRCS test_googlenet_lite.cc - DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils - ${ops} ${host_kernels} ${x86_kernels} - ARGS --model_dir=${LITE_MODEL_DIR}/googlenet) - add_dependencies(test_googlenet extern_lite_download_GoogleNet_inference_tar_gz) - lite_cc_test(test_mobilenetv1_lite_x86 SRCS test_mobilenetv1_lite_x86.cc - DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils - ${ops} ${host_kernels} ${x86_kernels} - ARGS --model_dir=${LITE_MODEL_DIR}/mobilenet_v1) - add_dependencies(test_mobilenetv1_lite_x86 extern_lite_download_mobilenet_v1_tar_gz) - lite_cc_test(test_mobilenetv2_lite_x86 SRCS test_mobilenetv2_lite_x86.cc - DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils - ${ops} ${host_kernels} ${x86_kernels} - ARGS --model_dir=${LITE_MODEL_DIR}/mobilenet_v2_relu) - add_dependencies(test_mobilenetv2_lite_x86 extern_lite_download_mobilenet_v2_relu_tar_gz) - 
lite_cc_test(test_inceptionv4_lite_x86 SRCS test_inceptionv4_lite_x86.cc - DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils - ${ops} ${host_kernels} ${x86_kernels} - ARGS --model_dir=${LITE_MODEL_DIR}/inception_v4_simple) - add_dependencies(test_inceptionv4_lite_x86 extern_lite_download_inception_v4_simple_tar_gz) - lite_cc_test(test_resnet50_lite_x86 SRCS test_resnet50_lite_x86.cc - DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils - ${ops} ${host_kernels} ${x86_kernels} - ARGS --model_dir=${LITE_MODEL_DIR}/resnet50) - add_dependencies(test_resnet50_lite_x86 extern_lite_download_resnet50_tar_gz) - lite_cc_test(test_step_rnn_lite_x86 SRCS test_step_rnn_lite_x86.cc - DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils - ${ops} ${host_kernels} ${x86_kernels} - ARGS --model_dir=${LITE_MODEL_DIR}/step_rnn) - add_dependencies(test_step_rnn_lite_x86 extern_lite_download_step_rnn_tar_gz) + if(LITE_WITH_X86) + lite_cc_test(test_googlenet SRCS test_googlenet_lite.cc + DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils + ${ops} ${host_kernels} ${x86_kernels} + ARGS --model_dir=${LITE_MODEL_DIR}/googlenet) + add_dependencies(test_googlenet extern_lite_download_GoogleNet_inference_tar_gz) + lite_cc_test(test_mobilenetv1_lite_x86 SRCS test_mobilenetv1_lite_x86.cc + DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils + ${ops} ${host_kernels} ${x86_kernels} + ARGS --model_dir=${LITE_MODEL_DIR}/mobilenet_v1) + add_dependencies(test_mobilenetv1_lite_x86 extern_lite_download_mobilenet_v1_tar_gz) + lite_cc_test(test_mobilenetv2_lite_x86 SRCS test_mobilenetv2_lite_x86.cc + DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils + ${ops} ${host_kernels} ${x86_kernels} + ARGS --model_dir=${LITE_MODEL_DIR}/mobilenet_v2_relu) + add_dependencies(test_mobilenetv2_lite_x86 
extern_lite_download_mobilenet_v2_relu_tar_gz) + lite_cc_test(test_inceptionv4_lite_x86 SRCS test_inceptionv4_lite_x86.cc + DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils + ${ops} ${host_kernels} ${x86_kernels} + ARGS --model_dir=${LITE_MODEL_DIR}/inception_v4_simple) + add_dependencies(test_inceptionv4_lite_x86 extern_lite_download_inception_v4_simple_tar_gz) + lite_cc_test(test_resnet50_lite_x86 SRCS test_resnet50_lite_x86.cc + DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils + ${ops} ${host_kernels} ${x86_kernels} + ARGS --model_dir=${LITE_MODEL_DIR}/resnet50) + add_dependencies(test_resnet50_lite_x86 extern_lite_download_resnet50_tar_gz) + lite_cc_test(test_step_rnn_lite_x86 SRCS test_step_rnn_lite_x86.cc + DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils + ${ops} ${host_kernels} ${x86_kernels} + ARGS --model_dir=${LITE_MODEL_DIR}/step_rnn) + add_dependencies(test_step_rnn_lite_x86 extern_lite_download_step_rnn_tar_gz) + endif() if(LITE_WITH_BM) - lite_cc_test(test_resnet50_lite_bm SRCS test_resnet50_lite_bm.cc + lite_cc_test(test_classify_lite_bm SRCS test_classify_lite_bm.cc DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils ${ops} ${host_kernels} ${bm_kernels} ${bm_bridges} ARGS --model_dir=${LITE_MODEL_DIR}/resnet50) @@ -229,6 +236,7 @@ if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING) ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl --model_dir=${LITE_MODEL_DIR}/inception_v4 SERIAL) add_dependencies(test_inceptionv4 extern_lite_download_inception_v4_simple_tar_gz) + # brief: we comment ocr_test_ut because we do not supply ocr model to test, it is the reference to infer nlp model # lite_cc_test(test_ocr_attention SRCS ocr_attention_test.cc # DEPS ${lite_model_test_DEPS}) @@ -295,6 +303,11 @@ if (LITE_ON_TINY_PUBLISH) return() endif() + +# add library for opt_base +lite_cc_library(opt_base SRCS opt_base.cc 
cxx_api_impl.cc paddle_api.cc cxx_api.cc DEPS kernel op optimizer mir_passes utils) +add_dependencies(opt_base supported_kernel_op_info_h framework_proto all_kernel_faked_cc kernel_list_h) + if (LITE_ON_MODEL_OPTIMIZE_TOOL) message(STATUS "Compiling opt") lite_cc_binary(opt SRCS opt.cc cxx_api_impl.cc paddle_api.cc cxx_api.cc @@ -330,6 +343,30 @@ if(NOT IOS) FPGA_DEPS ${fpga_kernels} X86_DEPS ${x86_kernels} CUDA_DEPS ${cuda_kernels}) + + lite_cc_binary(test_model_detection_bin SRCS model_test_detection.cc DEPS paddle_api_full paddle_api_light gflags utils + ${ops} ${host_kernels} + ARM_DEPS ${arm_kernels} + CV_DEPS paddle_cv_arm + NPU_DEPS ${npu_kernels} + XPU_DEPS ${xpu_kernels} + CL_DEPS ${opencl_kernels} + BM_DEPS ${bm_kernels} + FPGA_DEPS ${fpga_kernels} + X86_DEPS ${x86_kernels} + CUDA_DEPS ${cuda_kernels}) + + lite_cc_binary(test_model_classify_bin SRCS model_test_classify.cc DEPS paddle_api_full paddle_api_light gflags utils + ${ops} ${host_kernels} + ARM_DEPS ${arm_kernels} + CV_DEPS paddle_cv_arm + NPU_DEPS ${npu_kernels} + XPU_DEPS ${xpu_kernels} + CL_DEPS ${opencl_kernels} + BM_DEPS ${bm_kernels} + FPGA_DEPS ${fpga_kernels} + X86_DEPS ${x86_kernels} + CUDA_DEPS ${cuda_kernels}) lite_cc_binary(benchmark_bin SRCS benchmark.cc DEPS paddle_api_full paddle_api_light gflags utils ${ops} ${host_kernels} @@ -341,6 +378,7 @@ if(NOT IOS) FPGA_DEPS ${fpga_kernels} X86_DEPS ${x86_kernels} CUDA_DEPS ${cuda_kernels}) + lite_cc_binary(multithread_test SRCS lite_multithread_test.cc DEPS paddle_api_full paddle_api_light gflags utils ${ops} ${host_kernels} ARM_DEPS ${arm_kernels} @@ -352,6 +390,16 @@ if(NOT IOS) FPGA_DEPS ${fpga_kernels} X86_DEPS ${x86_kernels} CUDA_DEPS ${cuda_kernels}) + lite_cc_binary(test_transformer SRCS transform_test.cc DEPS paddle_api_full paddle_api_light gflags utils + ${ops} ${host_kernels} + ARM_DEPS ${arm_kernels} + CV_DEPS paddle_cv_arm + NPU_DEPS ${npu_kernels} + XPU_DEPS ${xpu_kernels} + CL_DEPS ${opencl_kernels} + FPGA_DEPS 
${fpga_kernels} + X86_DEPS ${x86_kernels} + CUDA_DEPS ${cuda_kernels}) endif() #lite_cc_binary(cxx_api_bin SRCS cxx_api_bin.cc diff --git a/lite/api/android/jni/native/CMakeLists.txt b/lite/api/android/jni/native/CMakeLists.txt index c1766772f8aaa417c3da1d72f2692c10c10194b4..d46e9f7cdec1cf422340ff11165ee166c7520bab 100644 --- a/lite/api/android/jni/native/CMakeLists.txt +++ b/lite/api/android/jni/native/CMakeLists.txt @@ -25,7 +25,11 @@ if (NOT LITE_ON_TINY_PUBLISH) endif() else() add_library(paddle_lite_jni SHARED "") - set_target_properties(paddle_lite_jni PROPERTIES COMPILE_FLAGS "-flto -fdata-sections") + set(TARGET_COMIPILE_FLAGS "-fdata-sections") + if (NOT (ARM_TARGET_LANG STREQUAL "clang")) #gcc + set(TARGET_COMIPILE_FLAGS "${TARGET_COMIPILE_FLAGS} -flto") + endif() + set_target_properties(paddle_lite_jni PROPERTIES COMPILE_FLAGS ${TARGET_COMIPILE_FLAGS}) target_sources(paddle_lite_jni PUBLIC ${__lite_cc_files} paddle_lite_jni.cc tensor_jni.cc) add_dependencies(paddle_lite_jni op_list_h kernel_list_h) if (LITE_WITH_NPU) diff --git a/lite/api/android/jni/native/paddle_lite_jni.h b/lite/api/android/jni/native/paddle_lite_jni.h index f447ce105a1ca7b2d94a00287d2b699f920a09af..983f108a869db91c7cfeb9eb539286e2a3f0bf99 100644 --- a/lite/api/android/jni/native/paddle_lite_jni.h +++ b/lite/api/android/jni/native/paddle_lite_jni.h @@ -17,11 +17,6 @@ #include /* Header for class com_baidu_paddle_lite_PaddlePredictor */ #include "lite/api/paddle_lite_factory_helper.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#ifndef LITE_ON_TINY_PUBLISH -#include "lite/api/paddle_use_passes.h" -#endif #ifdef __cplusplus extern "C" { #endif diff --git a/lite/api/android/jni/src/com/baidu/paddle/lite/MobileConfig.java b/lite/api/android/jni/src/com/baidu/paddle/lite/MobileConfig.java index e150f98f22113ef6bcedd5e9882e0bd2a6378c97..fe05c4302c71b439ae125e165244146726b3bf3d 100644 --- a/lite/api/android/jni/src/com/baidu/paddle/lite/MobileConfig.java 
+++ b/lite/api/android/jni/src/com/baidu/paddle/lite/MobileConfig.java @@ -78,7 +78,7 @@ public class MobileConfig extends ConfigBase { * * @return liteModelFile */ - public String getModelFile() { + public String getModelFromFile() { return liteModelFile; } @@ -96,7 +96,7 @@ public class MobileConfig extends ConfigBase { * * @return liteModelBuffer */ - public String getModelBuffer() { + public String getModelFromBuffer() { return liteModelBuffer; } diff --git a/lite/api/apis_test.cc b/lite/api/apis_test.cc index bb852297d11a8862460ed6f12e007d727aca9428..917f2a73a95c3fbd7464fd40824b833993a2a18c 100644 --- a/lite/api/apis_test.cc +++ b/lite/api/apis_test.cc @@ -21,9 +21,6 @@ #include #include "lite/api/cxx_api.h" #include "lite/api/light_api.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" #include "lite/core/mir/pass_registry.h" DEFINE_string(model_dir, "", ""); diff --git a/lite/api/benchmark.cc b/lite/api/benchmark.cc index 718dbe44296f2d197efc5b567cf0cc211835d176..d53de7bf2ed00fed70bbd1f70729a051e5d7203b 100644 --- a/lite/api/benchmark.cc +++ b/lite/api/benchmark.cc @@ -23,31 +23,28 @@ #include #include #include "lite/api/paddle_api.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" #include "lite/core/device_info.h" #include "lite/utils/cp_logging.h" #include "lite/utils/string.h" DEFINE_string(model_dir, "", - "the path of the model, set model_dir when the model is no " - "combined formate. This option will be ignored if model_file " - "and param_file are exist."); -DEFINE_string(model_file, + "the path of the model, the model and param files is under " + "model_dir."); +DEFINE_string(model_filename, "", - "the path of model file, set model_file when the model is " - "combined formate."); -DEFINE_string(param_file, + "the filename of model file. 
When the model is combined formate, " + "please set model_file."); +DEFINE_string(param_filename, "", - "the path of param file, set param_file when the model is " + "the filename of param file, set param_file when the model is " "combined formate."); DEFINE_string(input_shape, "1,3,224,224", "set input shapes according to the model, " "separated by colon and comma, " - "such as 1,3,244,244:1,3,300,300."); + "such as 1,3,244,244"); +DEFINE_string(input_img_path, "", "the path of input image"); DEFINE_int32(warmup, 0, "warmup times"); DEFINE_int32(repeats, 1, "repeats times"); DEFINE_int32(power_mode, @@ -80,12 +77,13 @@ inline double GetCurrentUS() { return 1e+6 * time.tv_sec + time.tv_usec; } -void OutputOptModel(const std::string& save_optimized_model_dir, - const std::vector>& input_shapes) { +void OutputOptModel(const std::string& save_optimized_model_dir) { lite_api::CxxConfig config; config.set_model_dir(FLAGS_model_dir); - config.set_model_file(FLAGS_model_file); - config.set_param_file(FLAGS_param_file); + if (!FLAGS_model_filename.empty() && !FLAGS_param_filename.empty()) { + config.set_model_file(FLAGS_model_dir + "/" + FLAGS_model_filename); + config.set_param_file(FLAGS_model_dir + "/" + FLAGS_param_filename); + } std::vector vaild_places = { Place{TARGET(kARM), PRECISION(kFloat)}, }; @@ -109,7 +107,7 @@ void OutputOptModel(const std::string& save_optimized_model_dir, } #ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK -void Run(const std::vector>& input_shapes, +void Run(const std::vector& input_shape, const std::string& model_dir, const std::string model_name) { // set config and create predictor @@ -121,17 +119,27 @@ void Run(const std::vector>& input_shapes, auto predictor = lite_api::CreatePaddlePredictor(config); // set input - for (int j = 0; j < input_shapes.size(); ++j) { - auto input_tensor = predictor->GetInput(j); - input_tensor->Resize(input_shapes[j]); - auto input_data = input_tensor->mutable_data(); - int input_num = 1; - for (size_t i = 0; i < 
input_shapes[j].size(); ++i) { - input_num *= input_shapes[j][i]; - } + auto input_tensor = predictor->GetInput(0); + input_tensor->Resize(input_shape); + auto input_data = input_tensor->mutable_data(); + int input_num = 1; + for (size_t i = 0; i < input_shape.size(); ++i) { + input_num *= input_shape[i]; + } + if (FLAGS_input_img_path.empty()) { for (int i = 0; i < input_num; ++i) { input_data[i] = 1.f; } + } else { + std::fstream fs(FLAGS_input_img_path); + if (!fs.is_open()) { + LOG(FATAL) << "open input image " << FLAGS_input_img_path << " error."; + } + for (int i = 0; i < input_num; i++) { + fs >> input_data[i]; + } + // LOG(INFO) << "input data:" << input_data[0] << " " << + // input_data[input_num-1]; } // warmup @@ -178,25 +186,12 @@ int main(int argc, char** argv) { exit(0); } + if (FLAGS_model_dir.back() == '/') { + FLAGS_model_dir.pop_back(); + } std::size_t found = FLAGS_model_dir.find_last_of("/"); std::string model_name = FLAGS_model_dir.substr(found + 1); - std::string save_optimized_model_dir = FLAGS_model_dir + "opt2"; - - auto split_string = - [](const std::string& str_in) -> std::vector { - std::vector str_out; - std::string tmp_str = str_in; - while (!tmp_str.empty()) { - size_t next_offset = tmp_str.find(":"); - str_out.push_back(tmp_str.substr(0, next_offset)); - if (next_offset == std::string::npos) { - break; - } else { - tmp_str = tmp_str.substr(next_offset + 1); - } - } - return str_out; - }; + std::string save_optimized_model_dir = FLAGS_model_dir + "_opt2"; auto get_shape = [](const std::string& str_shape) -> std::vector { std::vector shape; @@ -214,22 +209,18 @@ int main(int argc, char** argv) { return shape; }; - std::vector str_input_shapes = split_string(FLAGS_input_shape); - std::vector> input_shapes; - for (size_t i = 0; i < str_input_shapes.size(); ++i) { - input_shapes.push_back(get_shape(str_input_shapes[i])); - } + std::vector input_shape = get_shape(FLAGS_input_shape); // Output optimized model if needed if 
(FLAGS_run_model_optimize) { - paddle::lite_api::OutputOptModel(save_optimized_model_dir, input_shapes); + paddle::lite_api::OutputOptModel(save_optimized_model_dir); } #ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK // Run inference using optimized model std::string run_model_dir = FLAGS_run_model_optimize ? save_optimized_model_dir : FLAGS_model_dir; - paddle::lite_api::Run(input_shapes, run_model_dir, model_name); + paddle::lite_api::Run(input_shape, run_model_dir, model_name); #endif return 0; } diff --git a/lite/api/cxx_api.cc b/lite/api/cxx_api.cc index f6f7ec75e65ff54e3f3642822e51057d3522ae3a..556a9e0af01854ff5c57a14dade72b81ed255964 100644 --- a/lite/api/cxx_api.cc +++ b/lite/api/cxx_api.cc @@ -294,6 +294,32 @@ void Predictor::Build(const cpp::ProgramDesc &desc, inner_places.emplace_back(TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny)); inner_places.emplace_back( TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); + + const std::vector quant_dequant_op = { + "fake_quantize_abs_max", + "fake_quantize_range_abs_max", + "fake_quantize_moving_average_abs_max", + "fake_quantize_dequantize_moving_average_abs_max", + "fake_dequantize_max_abs", + "fake_channel_wise_dequantize_max_abs"}; + bool is_quantized_model = false; + for (size_t i = 0; i < program_desc_.BlocksSize() && !is_quantized_model; + ++i) { + auto *block_desc = program_desc_.GetBlock(i); + for (size_t j = 0; j < block_desc->OpsSize() && !is_quantized_model; ++j) { + auto *op_desc = block_desc->GetOp(j); + std::string op_type = op_desc->Type(); + if (std::find(quant_dequant_op.begin(), + quant_dequant_op.end(), + op_type) != quant_dequant_op.end()) { + is_quantized_model = true; + } + } + } + if (is_quantized_model) { + inner_places.emplace_back(Place{TARGET(kARM), PRECISION(kInt8)}); + } + Program program(desc, scope_, inner_places); core::KernelPickFactor factor; @@ -333,16 +359,16 @@ lite::Tensor *Predictor::GetInputByName(const std::string &name) { } } -#ifdef LITE_WITH_TRAIN -void 
Predictor::FeedVars(const std::vector &tensors) { - auto var = scope_->FindVar("feed"); - auto &feed_list = *(var->GetMutable>()); - feed_list.resize(tensors.size()); +// #ifdef LITE_WITH_TRAIN +// void Predictor::FeedVars(const std::vector &tensors) { +// auto var = scope_->FindVar("feed"); +// auto &feed_list = *(var->GetMutable>()); +// feed_list.resize(tensors.size()); - for (size_t i = 0; i < tensors.size(); ++i) - feed_list[i].ShareDataWith(tensors[i]); -} -#endif +// for (size_t i = 0; i < tensors.size(); ++i) +// feed_list[i].ShareDataWith(tensors[i]); +// } +// #endif } // namespace lite } // namespace paddle diff --git a/lite/api/cxx_api.h b/lite/api/cxx_api.h index 504710d9fa29420b8762f31e0c675b59c6c626bd..e63893cb91e112beb6be50bd661a57b9738e5fb1 100644 --- a/lite/api/cxx_api.h +++ b/lite/api/cxx_api.h @@ -101,14 +101,14 @@ class LITE_API Predictor { bool record_info = false); void SaveOpKernelInfo(const std::string& model_dir); -#ifdef LITE_WITH_TRAIN - void Run(const std::vector& tensors) { - FeedVars(tensors); - program_->Run(); - } - - void FeedVars(const std::vector& tensors); -#endif + // #ifdef LITE_WITH_TRAIN + // void Run(const std::vector& tensors) { + // FeedVars(tensors); + // program_->Run(); + // } + + // void FeedVars(const std::vector& tensors); + // #endif private: Optimizer optimizer_; diff --git a/lite/api/cxx_api_bin.cc b/lite/api/cxx_api_bin.cc index 8c929e9c8700a65c868e2facd763b0ec36719e23..eec17cc30e308e7169b7d8c394c0e47eee0c1c3e 100644 --- a/lite/api/cxx_api_bin.cc +++ b/lite/api/cxx_api_bin.cc @@ -67,7 +67,7 @@ void Run(const char* model_dir, int repeat) { int main(int argc, char** argv) { CHECK_EQ(argc, 3) << "usage: ./cmd "; - paddle::lite::Run(argv[1], std::stoi(argv[2])); + paddle::lite::Run(argv[1], atoi(argv[2])); return 0; } diff --git a/lite/api/cxx_api_impl.cc b/lite/api/cxx_api_impl.cc index 81ea60eac66849f8ce42fb8cb210226d18bbfa9b..972210c8f9ea05ba1b041382c43efad64aeacc1b 100644 --- a/lite/api/cxx_api_impl.cc +++ 
b/lite/api/cxx_api_impl.cc @@ -35,8 +35,16 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) { Env::Init(); #endif auto places = config.valid_places(); - raw_predictor_.Build(config, places); - + std::vector passes{}; + auto use_layout_preprocess_pass = + config.model_dir().find("OPENCL_PRE_PRECESS"); + VLOG(1) << "use_layout_preprocess_pass:" << use_layout_preprocess_pass; + if (places[0].target == TARGET(kOpenCL) && + use_layout_preprocess_pass != std::string::npos) { + passes = {"type_layout_cast_preprocess_pass"}; + VLOG(1) << "add pass:" << passes[0]; + } + raw_predictor_.Build(config, places, passes); mode_ = config.power_mode(); threads_ = config.threads(); diff --git a/lite/api/light_api.cc b/lite/api/light_api.cc index 29d8f4f29ab822f8c9601bbd63a3626abbbf1818..b641973a15b2e6abc1cf4c999d759271f7522638 100644 --- a/lite/api/light_api.cc +++ b/lite/api/light_api.cc @@ -13,6 +13,12 @@ // limitations under the License. #include "lite/api/light_api.h" +#include "paddle_use_kernels.h" // NOLINT +#include "paddle_use_ops.h" // NOLINT +#ifndef LITE_ON_TINY_PUBLISH +#include "lite/api/paddle_use_passes.h" +#endif + #include namespace paddle { @@ -25,6 +31,8 @@ void LightPredictor::Build(const std::string& lite_model_file, } else { LoadModelNaiveFromFile(lite_model_file, scope_.get(), &cpp_program_desc_); } + + DequantizeWeight(); BuildRuntimeProgram(cpp_program_desc_); PrepareFeedFetch(); } diff --git a/lite/api/light_api_impl.cc b/lite/api/light_api_impl.cc index 3965843250abe45c43490bdbb4aaed58915e0908..cdf5b7fb06df35b2e7fb72fc4e33ccb721a0f7f7 100644 --- a/lite/api/light_api_impl.cc +++ b/lite/api/light_api_impl.cc @@ -58,6 +58,7 @@ void LightPredictorImpl::Run() { std::shared_ptr LightPredictorImpl::Clone() { LOG(FATAL) << "The Clone API is not supported in LigthPredictor"; + return nullptr; } std::string LightPredictorImpl::GetVersion() const { return lite::version(); } diff --git a/lite/api/light_api_shared.cc b/lite/api/light_api_shared.cc 
index 557804bfa56787fa8a83bfbfc3046df08be010f8..cfe3d9de09a646e33c4a116bb3cd087d28aa24c2 100644 --- a/lite/api/light_api_shared.cc +++ b/lite/api/light_api_shared.cc @@ -12,11 +12,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "lite/api/paddle_api.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#ifndef LITE_ON_TINY_PUBLISH -#include "lite/api/paddle_use_passes.h" -#endif namespace paddle { namespace lite_api { diff --git a/lite/api/light_api_test.cc b/lite/api/light_api_test.cc index 7d322530f624c43737018d8ece98fb24d48bc16a..b49ff8b80c936b93acd630c6e0cde03df8b22ee4 100644 --- a/lite/api/light_api_test.cc +++ b/lite/api/light_api_test.cc @@ -15,9 +15,6 @@ #include "lite/api/light_api.h" #include #include -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" DEFINE_string(optimized_model, "", ""); diff --git a/lite/api/lite_multithread_test.cc b/lite/api/lite_multithread_test.cc index addd512eb0039c43edeca562b8f568528aab76f9..12559d171ff3df808cf252e8e09c652246902abf 100644 --- a/lite/api/lite_multithread_test.cc +++ b/lite/api/lite_multithread_test.cc @@ -16,9 +16,6 @@ #include #include #include "lite/api/paddle_api.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" #include "lite/api/test_helper.h" #include "lite/core/device_info.h" #include "lite/core/profile/timer.h" diff --git a/lite/api/mobilenetv1_test.cc b/lite/api/mobilenetv1_test.cc index bcc9644f81542ab6fb8a0badf8ecaea89fc8dedb..5342a36ec154b2bdde44fa72bc21e9d430ad4efe 100644 --- a/lite/api/mobilenetv1_test.cc +++ b/lite/api/mobilenetv1_test.cc @@ -53,9 +53,13 @@ void TestModel(const std::vector& valid_places, predictor.Run(); } - auto start = GetCurrentUS(); + double sum_duration = 0.0; // millisecond; for (int i = 0; i < FLAGS_repeats; ++i) { + auto start = 
GetCurrentUS(); predictor.Run(); + auto duration = (GetCurrentUS() - start) / 1000.0; + sum_duration += duration; + VLOG(1) << "run_idx:" << i << " " << duration << " ms"; } if (save_model) { @@ -68,8 +72,7 @@ void TestModel(const std::vector& valid_places, LOG(INFO) << "================== Speed Report ==================="; LOG(INFO) << "Model: " << model_dir << ", threads num " << FLAGS_threads << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats - << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0 - << " ms in average."; + << ", spend " << sum_duration / FLAGS_repeats << " ms in average."; std::vector> ref; ref.emplace_back(std::vector( @@ -81,29 +84,63 @@ void TestModel(const std::vector& valid_places, auto* out = predictor.GetOutput(0); const auto* pdata = out->data(); int step = 50; -#ifdef LITE_WITH_NPU - ASSERT_EQ(out->dims().production(), 1000); - double eps = 0.1; - for (int i = 0; i < ref.size(); ++i) { - for (int j = 0; j < ref[i].size(); ++j) { - auto result = pdata[j * step + (out->dims()[1] * i)]; - auto diff = std::fabs((result - ref[i][j]) / ref[i][j]); - VLOG(3) << diff; - EXPECT_LT(diff, eps); + + // Get target and check result + VLOG(1) << "valid_places.size():" << valid_places.size(); + for (int i = 0; i < valid_places.size(); ++i) { + auto p = valid_places[i]; + VLOG(1) << "valid_places[" << i << "]:" << p.DebugString(); + } + auto first_target = valid_places[0].target; + + if (first_target == TARGET(kOpenCL) || first_target == TARGET(kNPU)) { + ASSERT_EQ(out->dims().production(), 1000); + double eps = first_target == TARGET(kOpenCL) ? 
0.12 : 0.1; + for (int i = 0; i < ref.size(); ++i) { + for (int j = 0; j < ref[i].size(); ++j) { + auto result = pdata[j * step + (out->dims()[1] * i)]; + auto diff = std::fabs((result - ref[i][j]) / ref[i][j]); + VLOG(3) << diff; + EXPECT_LT(diff, eps); + } + } + } else { + ASSERT_EQ(out->dims().size(), 2); + ASSERT_EQ(out->dims()[0], 1); + ASSERT_EQ(out->dims()[1], 1000); + double eps = 1e-6; + for (int i = 0; i < ref.size(); ++i) { + for (int j = 0; j < ref[i].size(); ++j) { + auto result = pdata[j * step + (out->dims()[1] * i)]; + EXPECT_NEAR(result, ref[i][j], eps); + } } } -#else - ASSERT_EQ(out->dims().size(), 2); - ASSERT_EQ(out->dims()[0], 1); - ASSERT_EQ(out->dims()[1], 1000); - double eps = 1e-6; - for (int i = 0; i < ref.size(); ++i) { - for (int j = 0; j < ref[i].size(); ++j) { - auto result = pdata[j * step + (out->dims()[1] * i)]; - EXPECT_NEAR(result, ref[i][j], eps); + + // Get detailed result + size_t output_tensor_num = predictor.GetOutputNames().size(); + VLOG(1) << "output tensor num:" << output_tensor_num; + + for (size_t tidx = 0; tidx < output_tensor_num; ++tidx) { + auto* output_tensor = predictor.GetOutput(tidx); + VLOG(1) << "============= output tensor " << tidx << " =============\n"; + auto out_dims = output_tensor->dims(); + auto out_data = output_tensor->data(); + auto out_mean = compute_mean(out_data, out_dims.production()); + auto out_std_dev = compute_standard_deviation( + out_data, out_dims.production(), true, out_mean); + + VLOG(1) << "output tensor dims:" << out_dims; + VLOG(1) << "output tensor elements num:" << out_dims.production(); + VLOG(1) << "output tensor standard deviation:" << out_std_dev; + VLOG(1) << "output tensor mean value:" << out_mean; + + // print result + for (int i = 0; i < out_dims.production(); ++i) { + VLOG(2) << "output_tensor->data()[" << i + << "]:" << output_tensor->data()[i]; } } -#endif } #ifdef LITE_WITH_NPU @@ -130,7 +167,7 @@ TEST(MobileNetV1, test_arm) { #ifdef LITE_WITH_OPENCL TEST(MobileNetV1, 
test_opencl) { std::vector valid_places({ - Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kImageDefault)}, + Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault)}, Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)}, Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kImageDefault)}, Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)}, diff --git a/lite/api/mobilenetv2_test.cc b/lite/api/mobilenetv2_test.cc index 012d6d48d9e6d3747f83a7f1089944bbaf359f71..465f82056c6bb80b706cfb7d875773d75735911b 100644 --- a/lite/api/mobilenetv2_test.cc +++ b/lite/api/mobilenetv2_test.cc @@ -54,9 +54,13 @@ void TestModel(const std::vector& valid_places, predictor.Run(); } - auto start = GetCurrentUS(); + double sum_duration = 0.0; // millisecond; for (int i = 0; i < FLAGS_repeats; ++i) { + auto start = GetCurrentUS(); predictor.Run(); + auto duration = (GetCurrentUS() - start) / 1000.0; + sum_duration += duration; + VLOG(1) << "run_idx:" << i << " " << duration << " ms"; } if (save_model) { @@ -69,8 +73,7 @@ void TestModel(const std::vector& valid_places, LOG(INFO) << "================== Speed Report ==================="; LOG(INFO) << "Model: " << model_dir << ", threads num " << FLAGS_threads << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats - << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0 - << " ms in average."; + << ", spend " << sum_duration / FLAGS_repeats << " ms in average."; std::vector> ref; // i = 1 @@ -83,27 +86,63 @@ void TestModel(const std::vector& valid_places, auto* out = predictor.GetOutput(0); const auto* pdata = out->data(); int step = 50; -#ifdef LITE_WITH_NPU - ASSERT_EQ(out->dims().production(), 1000); - double eps = 0.1; - for (int i = 0; i < ref.size(); ++i) { - for (int j = 0; j < ref[i].size(); ++j) { - auto result = pdata[j * step + (out->dims()[1] * i)]; - auto diff = std::fabs((result - ref[i][j]) / ref[i][j]); - VLOG(3) << diff; - EXPECT_LT(diff, eps); + + // Get target and check 
result + VLOG(1) << "valid_places.size():" << valid_places.size(); + for (int i = 0; i < valid_places.size(); ++i) { + auto p = valid_places[i]; + VLOG(1) << "valid_places[" << i << "]:" << p.DebugString(); + } + auto first_target = valid_places[0].target; + + if (first_target == TARGET(kOpenCL) || first_target == TARGET(kNPU)) { + ASSERT_EQ(out->dims().production(), 1000); + double eps = first_target == TARGET(kOpenCL) ? 0.15 : 0.1; + for (int i = 0; i < ref.size(); ++i) { + for (int j = 0; j < ref[i].size(); ++j) { + auto result = pdata[j * step + (out->dims()[1] * i)]; + auto diff = std::fabs((result - ref[i][j]) / ref[i][j]); + VLOG(3) << diff; + EXPECT_LT(diff, eps); + } + } + } else { + ASSERT_EQ(out->dims().size(), 2); + ASSERT_EQ(out->dims()[0], 1); + ASSERT_EQ(out->dims()[1], 1000); + double eps = 1e-6; + for (int i = 0; i < ref.size(); ++i) { + for (int j = 0; j < ref[i].size(); ++j) { + auto result = pdata[j * step + (out->dims()[1] * i)]; + EXPECT_NEAR(result, ref[i][j], eps); + } } } -#else - ASSERT_EQ(out->dims().size(), 2); - ASSERT_EQ(out->dims()[0], 1); - ASSERT_EQ(out->dims()[1], 1000); - for (int i = 0; i < ref.size(); ++i) { - for (int j = 0; j < ref[i].size(); ++j) { - EXPECT_NEAR(pdata[j * step + (out->dims()[1] * i)], ref[i][j], 1e-6); + + // Get detailed result + size_t output_tensor_num = predictor.GetOutputNames().size(); + VLOG(1) << "output tensor num:" << output_tensor_num; + + for (size_t tidx = 0; tidx < output_tensor_num; ++tidx) { + auto* output_tensor = predictor.GetOutput(tidx); + VLOG(1) << "============= output tensor " << tidx << " =============\n"; + auto out_dims = output_tensor->dims(); + auto out_data = output_tensor->data(); + auto out_mean = compute_mean(out_data, out_dims.production()); + auto out_std_dev = compute_standard_deviation( + out_data, out_dims.production(), true, out_mean); + + VLOG(1) << "output tensor dims:" << out_dims; + VLOG(1) << "output tensor elements num:" << out_dims.production(); + VLOG(1) << 
"output tensor standard deviation:" << out_std_dev; + VLOG(1) << "output tensor mean value:" << out_mean; + + // print result + for (int i = 0; i < out_dims.production(); ++i) { + VLOG(2) << "output_tensor->data()[" << i + << "]:" << output_tensor->data()[i]; } } -#endif } #ifdef LITE_WITH_NPU @@ -130,7 +169,7 @@ TEST(MobileNetV2, test_arm) { #ifdef LITE_WITH_OPENCL TEST(MobileNetV2, test_opencl) { std::vector valid_places({ - Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kImageDefault)}, + Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault)}, Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)}, Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kImageDefault)}, Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)}, diff --git a/lite/api/model_test.cc b/lite/api/model_test.cc index 190890da4c109f39cc52ca5209cd952f8937f780..b0f7a0479f0db91b816838f9d0ee1cc31b9b232a 100644 --- a/lite/api/model_test.cc +++ b/lite/api/model_test.cc @@ -17,9 +17,6 @@ #include #include #include "lite/api/paddle_api.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" #include "lite/api/test_helper.h" #include "lite/core/device_info.h" #include "lite/core/profile/timer.h" @@ -141,7 +138,7 @@ void Run(const std::vector>& input_shapes, std::ofstream out(FLAGS_arg_name + ".txt"); for (size_t i = 0; i < arg_num; ++i) { sum += arg_tensor->data()[i]; - out << std::to_string(arg_tensor->data()[i]) << "\n"; + out << paddle::lite::to_string(arg_tensor->data()[i]) << "\n"; } LOG(INFO) << FLAGS_arg_name << " shape is " << os.str() << ", mean value is " << sum * 1. / arg_num; diff --git a/lite/api/model_test_classify.cc b/lite/api/model_test_classify.cc new file mode 100644 index 0000000000000000000000000000000000000000..375d249476bf5323d69ea41c3f11d07e9c8bc711 --- /dev/null +++ b/lite/api/model_test_classify.cc @@ -0,0 +1,335 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include "lite/api/paddle_api.h" +#include "lite/api/test_helper.h" +#include "lite/core/device_info.h" +#include "lite/core/profile/timer.h" +#include "lite/utils/cp_logging.h" +#include "lite/utils/string.h" +#ifdef LITE_WITH_PROFILE +#include "lite/core/profile/basic_profiler.h" +#endif // LITE_WITH_PROFILE + +using paddle::lite::profile::Timer; + +DEFINE_string(input_shape, + "1,3,224,224", + "input shapes, separated by colon and comma"); +DEFINE_bool(use_optimize_nb, + false, + "optimized & naive buffer model for mobile devices"); +DEFINE_string(arg_name, "", "the arg name"); + +DEFINE_string(threshold, "0.5", "threshold value default 0.5f"); +DEFINE_string(in_txt, "", "input text"); +DEFINE_string(out_txt, "", "output text"); +DEFINE_string(label_file, "", "label file path"); +DEFINE_int32(topk, 1, "topk num"); + +namespace paddle { +namespace lite_api { + +void OutputOptModel(const std::string& load_model_dir, + const std::string& save_optimized_model_dir, + const std::vector>& input_shapes) { + lite_api::CxxConfig config; + config.set_model_dir(load_model_dir); + config.set_valid_places({ + Place{TARGET(kARM), PRECISION(kFloat)}, + }); + auto predictor = lite_api::CreatePaddlePredictor(config); + + // delete old optimized model + int ret = system( + paddle::lite::string_format("rm -rf %s", save_optimized_model_dir.c_str()) + .c_str()); + if (ret == 0) { + 
LOG(INFO) << "delete old optimized model " << save_optimized_model_dir; + } + predictor->SaveOptimizedModel(save_optimized_model_dir, + LiteModelType::kNaiveBuffer); + LOG(INFO) << "Load model from " << load_model_dir; + LOG(INFO) << "Save optimized model to " << save_optimized_model_dir; +} + +#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK +std::vector load_labels(std::string label_path) { + FILE* fp = fopen(label_path.c_str(), "r"); + if (fp == nullptr) { + LOG(FATAL) << "load label file failed! " << label_path; + } + std::vector labels; + while (!feof(fp)) { + char str[1024]; + fgets(str, 1024, fp); + std::string str_s(str); + + if (str_s.length() > 0) { + for (int i = 0; i < str_s.length(); i++) { + if (str_s[i] == ' ') { + std::string strr = str_s.substr(i, str_s.length() - i - 1); + labels.push_back(strr); + i = str_s.length(); + } + } + } + } + fclose(fp); + return labels; +} + +void print_topk(const float* scores, + const int size, + const int topk, + const std::vector labels) { + std::vector> vec; + vec.resize(size); + for (int i = 0; i < size; i++) { + vec[i] = std::make_pair(scores[i], i); + } + std::partial_sort(vec.begin(), + vec.begin() + topk, + vec.end(), + std::greater>()); + + // print topk and score + std::string name = FLAGS_out_txt + "_accu.txt"; + FILE* fp = fopen(name.c_str(), "w"); + fprintf(fp, "%d \n", topk); + for (int i = 0; i < topk; i++) { + float score = vec[i].first; + int index = vec[i].second; + fprintf(fp, "%d ", index); + fprintf(fp, "%f \n", score); + LOG(INFO) << i << ": " << index << " " << labels[index] << " " << score; + } + fclose(fp); +} + +void Run(const std::vector>& input_shapes, + const std::string& model_dir, + const PowerMode power_mode, + const int thread_num, + const int repeat, + const int warmup_times = 0) { + lite_api::MobileConfig config; + config.set_model_dir(model_dir); + config.set_power_mode(power_mode); + config.set_threads(thread_num); + + auto predictor = lite_api::CreatePaddlePredictor(config); + bool flag_in 
= true; + bool flag_out = true; + if (FLAGS_in_txt == "") { + flag_in = false; + } + if (FLAGS_out_txt == "") { + flag_out = false; + } + printf("flag_in: %d, flag_out: %d \n", flag_in, flag_out); + for (int j = 0; j < input_shapes.size(); ++j) { + auto input_tensor = predictor->GetInput(j); + input_tensor->Resize(input_shapes[j]); + auto input_data = input_tensor->mutable_data(); + int input_num = 1; + for (int i = 0; i < input_shapes[j].size(); ++i) { + input_num *= input_shapes[j][i]; + } + + FILE* fp_r = nullptr; + if (flag_in) { + fp_r = fopen(FLAGS_in_txt.c_str(), "r"); + } + for (int i = 0; i < input_num; ++i) { + if (flag_in) { + fscanf(fp_r, "%f\n", &input_data[i]); + } else { + input_data[i] = 1.f; + } + } + if (flag_in) { + fclose(fp_r); + } + } + + for (int i = 0; i < warmup_times; ++i) { + predictor->Run(); + } + + Timer ti; + for (int j = 0; j < repeat; ++j) { + ti.Start(); + predictor->Run(); + float t = ti.Stop(); + LOG(INFO) << "iter: " << j << ", time: " << t << " ms"; + } + + LOG(INFO) << "================== Speed Report ==================="; + LOG(INFO) << "Model: " << model_dir + << ", power_mode: " << static_cast(power_mode) + << ", threads num " << thread_num << ", warmup: " << warmup_times + << ", repeats: " << repeat << ", avg time: " << ti.LapTimes().Avg() + << " ms" + << ", min time: " << ti.LapTimes().Min() << " ms" + << ", max time: " << ti.LapTimes().Max() << " ms."; + + auto output = predictor->GetOutput(0); + auto out = output->data(); + auto output_shape = output->shape(); + int output_num = 1; + for (int i = 0; i < output_shape.size(); ++i) { + output_num *= output_shape[i]; + } + // classify + printf("load_labels \n"); + std::vector labels = load_labels(FLAGS_label_file); + printf("print_topk \n"); + print_topk(out, output_num, FLAGS_topk, labels); + LOG(INFO) << "output_num: " << output_num; + LOG(INFO) << "out " << out[0]; + LOG(INFO) << "out " << out[1]; + FILE* fp = nullptr; + if (flag_out) { + fp = 
fopen(FLAGS_out_txt.c_str(), "w"); + } + double sum1 = 0.f; + for (int i = 0; i < output_num; ++i) { + if (flag_out) { + fprintf(fp, "%f\n", out[i]); + } + sum1 += out[i]; + } + if (flag_out) { + fclose(fp); + } + printf("out mean: %f \n", sum1 / output_num); + + FILE* fp_w = fopen("time.txt", "a+"); + if (!fp_w) { + printf("open file failed \n"); + return; + } + fprintf(fp_w, + "model: %s, threads: %d, avg: %f ms, min: %f ms, max: %f ms \n", + model_dir.c_str(), + thread_num, + ti.LapTimes().Avg(), + ti.LapTimes().Min(), + ti.LapTimes().Max()); + fclose(fp_w); + + // please turn off memory_optimize_pass to use this feature. + if (FLAGS_arg_name != "") { + auto arg_tensor = predictor->GetTensor(FLAGS_arg_name); + auto arg_shape = arg_tensor->shape(); + int arg_num = 1; + std::ostringstream os; + os << "{"; + for (int i = 0; i < arg_shape.size(); ++i) { + arg_num *= arg_shape[i]; + os << arg_shape[i] << ","; + } + os << "}"; + float sum = 0.; + std::ofstream out(FLAGS_arg_name + ".txt"); + for (size_t i = 0; i < arg_num; ++i) { + sum += arg_tensor->data()[i]; + out << paddle::lite::to_string(arg_tensor->data()[i]) << "\n"; + } + LOG(INFO) << FLAGS_arg_name << " shape is " << os.str() + << ", mean value is " << sum * 1. 
/ arg_num; + } +} +#endif + +} // namespace lite_api +} // namespace paddle + +int main(int argc, char** argv) { + gflags::ParseCommandLineFlags(&argc, &argv, true); + if (FLAGS_model_dir == "") { + LOG(INFO) << "usage: " + << "--model_dir /path/to/your/model"; + exit(0); + } + std::string save_optimized_model_dir = ""; + if (FLAGS_use_optimize_nb) { + save_optimized_model_dir = FLAGS_model_dir; + } else { + save_optimized_model_dir = FLAGS_model_dir + "opt2"; + } + + auto split_string = + [](const std::string& str_in) -> std::vector { + std::vector str_out; + std::string tmp_str = str_in; + while (!tmp_str.empty()) { + size_t next_offset = tmp_str.find(":"); + str_out.push_back(tmp_str.substr(0, next_offset)); + if (next_offset == std::string::npos) { + break; + } else { + tmp_str = tmp_str.substr(next_offset + 1); + } + } + return str_out; + }; + + auto get_shape = [](const std::string& str_shape) -> std::vector { + std::vector shape; + std::string tmp_str = str_shape; + while (!tmp_str.empty()) { + int dim = atoi(tmp_str.data()); + shape.push_back(dim); + size_t next_offset = tmp_str.find(","); + if (next_offset == std::string::npos) { + break; + } else { + tmp_str = tmp_str.substr(next_offset + 1); + } + } + return shape; + }; + + LOG(INFO) << "input shapes: " << FLAGS_input_shape; + std::vector str_input_shapes = split_string(FLAGS_input_shape); + std::vector> input_shapes; + for (int i = 0; i < str_input_shapes.size(); ++i) { + LOG(INFO) << "input shape: " << str_input_shapes[i]; + input_shapes.push_back(get_shape(str_input_shapes[i])); + } + + if (!FLAGS_use_optimize_nb) { + // Output optimized model + paddle::lite_api::OutputOptModel( + FLAGS_model_dir, save_optimized_model_dir, input_shapes); + } + +#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK + // Run inference using optimized model + paddle::lite_api::Run( + input_shapes, + save_optimized_model_dir, + static_cast(FLAGS_power_mode), + FLAGS_threads, + FLAGS_repeats, + FLAGS_warmup); +#endif + return 0; +} diff 
--git a/lite/api/model_test_detection.cc b/lite/api/model_test_detection.cc new file mode 100644 index 0000000000000000000000000000000000000000..f9be12b2c78c623a2b2c9852850576cc11815bd3 --- /dev/null +++ b/lite/api/model_test_detection.cc @@ -0,0 +1,349 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include "lite/api/paddle_api.h" +#include "lite/api/test_helper.h" +#include "lite/core/device_info.h" +#include "lite/core/profile/timer.h" +#include "lite/utils/cp_logging.h" +#include "lite/utils/string.h" +#ifdef LITE_WITH_PROFILE +#include "lite/core/profile/basic_profiler.h" +#endif // LITE_WITH_PROFILE + +using paddle::lite::profile::Timer; + +DEFINE_string(input_shape, + "1,3,224,224", + "input shapes, separated by colon and comma"); +DEFINE_bool(use_optimize_nb, + false, + "optimized & naive buffer model for mobile devices"); +DEFINE_string(arg_name, "", "the arg name"); + +DEFINE_string(threshold, "0.5", "threshold value default 0.5f"); +DEFINE_string(in_txt, "", "input text"); +DEFINE_string(out_txt, "", "output text"); +DEFINE_int32(orih, 1920, "input image height"); +DEFINE_int32(oriw, 1080, "input image width"); + +namespace paddle { +namespace lite_api { + +struct Object { + float x; + float y; + float width; + float height; + float class_id; + float prob; +}; + +void OutputOptModel(const std::string& load_model_dir, + const 
std::string& save_optimized_model_dir, + const std::vector>& input_shapes) { + lite_api::CxxConfig config; + config.set_model_dir(load_model_dir); + config.set_valid_places({ + Place{TARGET(kARM), PRECISION(kFloat)}, + }); + auto predictor = lite_api::CreatePaddlePredictor(config); + + // delete old optimized model + int ret = system( + paddle::lite::string_format("rm -rf %s", save_optimized_model_dir.c_str()) + .c_str()); + if (ret == 0) { + LOG(INFO) << "delete old optimized model " << save_optimized_model_dir; + } + predictor->SaveOptimizedModel(save_optimized_model_dir, + LiteModelType::kNaiveBuffer); + LOG(INFO) << "Load model from " << load_model_dir; + LOG(INFO) << "Save optimized model to " << save_optimized_model_dir; +} + +void detect_choose(const float* dout, + std::vector dims, + const float thresh) { + std::string name = FLAGS_out_txt + "_accu.txt"; + FILE* fp = fopen(name.c_str(), "w"); + for (int iw = 0; iw < dims[0]; iw++) { + const float* values = dout + iw * dims[1]; + if (values[1] > thresh) { // pro > 0.01 + fprintf(fp, "%f \n", values[0]); + fprintf(fp, "%f \n", values[1]); + fprintf(fp, "%f \n", values[2]); + fprintf(fp, "%f \n", values[3]); + fprintf(fp, "%f \n", values[4]); + fprintf(fp, "%f \n", values[5]); + } + } + fclose(fp); +} +void detect_object(const float* dout, + std::vector dims, + const float thresh, + int orih, + int oriw) { + std::vector objects; + for (int iw = 0; iw < dims[0]; iw++) { + Object object; + const float* values = dout + iw * dims[1]; + object.class_id = values[0]; + object.prob = values[1]; + object.x = values[2] * oriw; + object.y = values[3] * orih; + object.width = values[4] * oriw - object.x; + object.height = values[5] * orih - object.y; + objects.push_back(object); + } + std::string name = FLAGS_out_txt + "_accu.txt"; + FILE* fp = fopen(name.c_str(), "w"); + for (int i = 0; i < objects.size(); ++i) { + Object object = objects.at(i); + if (object.prob > thresh && object.x > 0 && object.y > 0 && + object.width 
> 0 && object.height > 0) { + if (object.x >= oriw || object.width >= oriw || object.y >= orih || + object.height >= orih) + continue; + fprintf(fp, "%f \n", object.x); + fprintf(fp, "%f \n", object.y); + fprintf(fp, "%f \n", object.width); + fprintf(fp, "%f \n", object.height); + fprintf(fp, "%f \n", object.prob); + fprintf(fp, "%f \n", object.class_id); + LOG(INFO) << "object id: " << object.class_id << ", image size: " << oriw + << ", " << orih << ", detect object: " << object.prob + << ", location: x=" << object.x << ", y=" << object.y + << ", width=" << object.width << ", height=" << object.height; + } + } + fclose(fp); +} +#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK +void Run(const std::vector>& input_shapes, + const std::string& model_dir, + const PowerMode power_mode, + const int thread_num, + const int repeat, + const int warmup_times = 0) { + lite_api::MobileConfig config; + config.set_model_dir(model_dir); + config.set_power_mode(power_mode); + config.set_threads(thread_num); + + auto predictor = lite_api::CreatePaddlePredictor(config); + bool flag_in = true; + bool flag_out = true; + if (FLAGS_in_txt == "") { + flag_in = false; + } + if (FLAGS_out_txt == "") { + flag_out = false; + } + printf("flag_in: %d, flag_out: %d \n", flag_in, flag_out); + for (int j = 0; j < input_shapes.size(); ++j) { + auto input_tensor = predictor->GetInput(j); + input_tensor->Resize(input_shapes[j]); + auto input_data = input_tensor->mutable_data(); + int input_num = 1; + for (int i = 0; i < input_shapes[j].size(); ++i) { + input_num *= input_shapes[j][i]; + } + + FILE* fp_r = nullptr; + if (flag_in) { + fp_r = fopen(FLAGS_in_txt.c_str(), "r"); + } + for (int i = 0; i < input_num; ++i) { + if (flag_in) { + fscanf(fp_r, "%f\n", &input_data[i]); + } else { + input_data[i] = 1.f; + } + } + if (flag_in) { + fclose(fp_r); + } + } + + for (int i = 0; i < warmup_times; ++i) { + predictor->Run(); + } + + Timer ti; + for (int j = 0; j < repeat; ++j) { + ti.Start(); + predictor->Run(); + 
float t = ti.Stop(); + LOG(INFO) << "iter: " << j << ", time: " << t << " ms"; + } + + LOG(INFO) << "================== Speed Report ==================="; + LOG(INFO) << "Model: " << model_dir + << ", power_mode: " << static_cast(power_mode) + << ", threads num " << thread_num << ", warmup: " << warmup_times + << ", repeats: " << repeat << ", avg time: " << ti.LapTimes().Avg() + << " ms" + << ", min time: " << ti.LapTimes().Min() << " ms" + << ", max time: " << ti.LapTimes().Max() << " ms."; + + auto output = predictor->GetOutput(0); + auto out = output->data(); + auto output_shape = output->shape(); + // detect + detect_object( + out, output_shape, atof(FLAGS_threshold.data()), FLAGS_orih, FLAGS_oriw); + // detect_choose(out, output_shape, atof(FLAGS_threshold.data())); + LOG(INFO) << "out " << out[0]; + LOG(INFO) << "out " << out[1]; + int output_num = 1; + for (int i = 0; i < output_shape.size(); ++i) { + output_num *= output_shape[i]; + } + LOG(INFO) << "output_num: " << output_num; + FILE* fp = nullptr; + if (flag_out) { + fp = fopen(FLAGS_out_txt.c_str(), "w"); + } + double sum1 = 0.f; + for (int i = 0; i < output_num; ++i) { + if (flag_out) { + fprintf(fp, "%f\n", out[i]); + } + sum1 += out[i]; + } + if (flag_out) { + fclose(fp); + } + + printf("out mean: %f \n", sum1 / output_num); + + FILE* fp_w = fopen("time.txt", "a+"); + if (!fp_w) { + printf("open file failed \n"); + return; + } + fprintf(fp_w, + "model: %s, threads: %d, avg: %f ms, min: %f ms, max: %f ms \n", + model_dir.c_str(), + thread_num, + ti.LapTimes().Avg(), + ti.LapTimes().Min(), + ti.LapTimes().Max()); + fclose(fp_w); + + // please turn off memory_optimize_pass to use this feature. 
+ if (FLAGS_arg_name != "") { + auto arg_tensor = predictor->GetTensor(FLAGS_arg_name); + auto arg_shape = arg_tensor->shape(); + int arg_num = 1; + std::ostringstream os; + os << "{"; + for (int i = 0; i < arg_shape.size(); ++i) { + arg_num *= arg_shape[i]; + os << arg_shape[i] << ","; + } + os << "}"; + float sum = 0.; + std::ofstream out(FLAGS_arg_name + ".txt"); + for (size_t i = 0; i < arg_num; ++i) { + sum += arg_tensor->data()[i]; + out << paddle::lite::to_string(arg_tensor->data()[i]) << "\n"; + } + LOG(INFO) << FLAGS_arg_name << " shape is " << os.str() + << ", mean value is " << sum * 1. / arg_num; + } +} +#endif + +} // namespace lite_api +} // namespace paddle + +int main(int argc, char** argv) { + gflags::ParseCommandLineFlags(&argc, &argv, true); + if (FLAGS_model_dir == "") { + LOG(INFO) << "usage: " + << "--model_dir /path/to/your/model"; + exit(0); + } + std::string save_optimized_model_dir = ""; + if (FLAGS_use_optimize_nb) { + save_optimized_model_dir = FLAGS_model_dir; + } else { + save_optimized_model_dir = FLAGS_model_dir + "opt2"; + } + + auto split_string = + [](const std::string& str_in) -> std::vector { + std::vector str_out; + std::string tmp_str = str_in; + while (!tmp_str.empty()) { + size_t next_offset = tmp_str.find(":"); + str_out.push_back(tmp_str.substr(0, next_offset)); + if (next_offset == std::string::npos) { + break; + } else { + tmp_str = tmp_str.substr(next_offset + 1); + } + } + return str_out; + }; + + auto get_shape = [](const std::string& str_shape) -> std::vector { + std::vector shape; + std::string tmp_str = str_shape; + while (!tmp_str.empty()) { + int dim = atoi(tmp_str.data()); + shape.push_back(dim); + size_t next_offset = tmp_str.find(","); + if (next_offset == std::string::npos) { + break; + } else { + tmp_str = tmp_str.substr(next_offset + 1); + } + } + return shape; + }; + + LOG(INFO) << "input shapes: " << FLAGS_input_shape; + std::vector str_input_shapes = split_string(FLAGS_input_shape); + std::vector> 
input_shapes; + for (int i = 0; i < str_input_shapes.size(); ++i) { + LOG(INFO) << "input shape: " << str_input_shapes[i]; + input_shapes.push_back(get_shape(str_input_shapes[i])); + } + + if (!FLAGS_use_optimize_nb) { + // Output optimized model + paddle::lite_api::OutputOptModel( + FLAGS_model_dir, save_optimized_model_dir, input_shapes); + } + +#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK + // Run inference using optimized model + paddle::lite_api::Run( + input_shapes, + save_optimized_model_dir, + static_cast(FLAGS_power_mode), + FLAGS_threads, + FLAGS_repeats, + FLAGS_warmup); +#endif + return 0; +} diff --git a/lite/api/ocr_attention_test.cc b/lite/api/ocr_attention_test.cc index 5e39c5437c18990be9c6414695a94c6f2c9fcf20..ae45b8e2282d0946019d83a76298c0b0a61f9832 100644 --- a/lite/api/ocr_attention_test.cc +++ b/lite/api/ocr_attention_test.cc @@ -32,18 +32,10 @@ void TestModel(const std::vector& valid_places, bool use_npu = false) { predictor.Build(FLAGS_model_dir, "", "", valid_places); - auto* input_tensor = predictor.GetInput(0); - input_tensor->Resize(DDim(std::vector({1, 1, 48, 512}))); - auto* data = input_tensor->mutable_data(); - auto item_size = input_tensor->dims().production(); - for (int i = 0; i < item_size; i++) { - data[i] = 1; - } - auto* init_scores = predictor.GetInput(2); init_scores->Resize(DDim(std::vector({1, 1}))); auto* data_scores = init_scores->mutable_data(); - auto scores_size = input_tensor->dims().production(); + auto scores_size = init_scores->dims().production(); for (int i = 0; i < scores_size; i++) { data_scores[i] = 0; } @@ -53,7 +45,7 @@ void TestModel(const std::vector& valid_places, bool use_npu = false) { auto* init_ids = predictor.GetInput(1); init_ids->Resize(DDim(std::vector({1, 1}))); - auto* data_ids = init_ids->mutable_data(); + auto* data_ids = init_ids->mutable_data(); auto ids_size = init_ids->dims().production(); for (int i = 0; i < ids_size; i++) { data_ids[i] = 0; @@ -62,6 +54,13 @@ void TestModel(const std::vector& 
valid_places, bool use_npu = false) { std::vector> lod_i{{0, 1}, {0, 1}}; *lod_ids = lod_i; + auto* input_tensor = predictor.GetInput(0); + input_tensor->Resize(DDim(std::vector({1, 1, 48, 512}))); + auto* data = input_tensor->mutable_data(); + auto item_size = input_tensor->dims().production(); + for (int i = 0; i < item_size; i++) { + data[i] = 1; + } for (int i = 0; i < FLAGS_warmup; ++i) { predictor.Run(); } @@ -102,6 +101,7 @@ void TestModel(const std::vector& valid_places, bool use_npu = false) { TEST(OcrAttention, test_arm) { std::vector valid_places({ + Place{TARGET(kARM), PRECISION(kInt64)}, Place{TARGET(kARM), PRECISION(kFloat)}, }); diff --git a/lite/api/opt.cc b/lite/api/opt.cc index a00646f4e11b68f0233a8b6009fbf847e9d50d63..b8497199684cb4f6d4cc602291be5762eb93f7f9 100644 --- a/lite/api/opt.cc +++ b/lite/api/opt.cc @@ -30,6 +30,7 @@ #include "lite/model_parser/compatible_pb.h" #include "lite/model_parser/pb/program_desc.h" #include "lite/utils/cp_logging.h" +#include "lite/utils/io.h" #include "lite/utils/string.h" #include "supported_kernel_op_info.h" // NOLINT @@ -66,7 +67,6 @@ DEFINE_string(valid_targets, "arm", "The targets this model optimized for, should be one of (arm, " "opencl, x86), splitted by space"); -DEFINE_bool(prefer_int8_kernel, false, "Prefer to run model with int8 kernels"); DEFINE_bool(print_supported_ops, false, "Print supported operators on the inputed target"); @@ -87,10 +87,13 @@ std::vector ParserValidPlaces() { auto target_reprs = lite::Split(FLAGS_valid_targets, ","); for (auto& target_repr : target_reprs) { if (target_repr == "arm") { - valid_places.emplace_back(TARGET(kARM)); + valid_places.emplace_back( + Place{TARGET(kARM), PRECISION(kFloat), DATALAYOUT(kNCHW)}); + valid_places.emplace_back( + Place{TARGET(kARM), PRECISION(kInt32), DATALAYOUT(kNCHW)}); } else if (target_repr == "opencl") { valid_places.emplace_back( - Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kImageDefault)}); + Place{TARGET(kOpenCL), 
PRECISION(kFP16), DATALAYOUT(kImageDefault)}); valid_places.emplace_back( Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)}); valid_places.emplace_back( @@ -117,11 +120,6 @@ std::vector ParserValidPlaces() { << "At least one target should be set, should set the " "command argument 'valid_targets'"; - if (FLAGS_prefer_int8_kernel) { - LOG(WARNING) << "Int8 mode is only support by ARM target"; - valid_places.insert(valid_places.begin(), - Place{TARGET(kARM), PRECISION(kInt8)}); - } return valid_places; } @@ -251,7 +249,6 @@ void PrintHelpInfo() { " `--optimize_out_type=(protobuf|naive_buffer)`\n" " `--optimize_out=`\n" " `--valid_targets=(arm|opencl|x86|npu|xpu)`\n" - " `--prefer_int8_kernel=(true|false)`\n" " `--record_tailoring_info=(true|false)`\n" " Arguments of model checking and ops information:\n" " `--print_all_ops=true` Display all the valid operators of " @@ -400,6 +397,7 @@ void Main() { return; } + lite::MkDirRecur(FLAGS_optimize_out); auto model_dirs = lite::ListDir(FLAGS_model_set_dir, true); if (model_dirs.size() == 0) { LOG(FATAL) << "[" << FLAGS_model_set_dir << "] does not contain any model"; @@ -454,7 +452,9 @@ int main(int argc, char** argv) { } google::ParseCommandLineFlags(&argc, &argv, false); paddle::lite_api::ParseInputCommand(); - paddle::lite_api::CheckIfModelSupported(); + if (FLAGS_model_set_dir == "") { + paddle::lite_api::CheckIfModelSupported(); + } paddle::lite_api::Main(); return 0; } diff --git a/lite/api/opt_base.cc b/lite/api/opt_base.cc new file mode 100644 index 0000000000000000000000000000000000000000..bd86f486248a2daccde13da078ae3860d8e31169 --- /dev/null +++ b/lite/api/opt_base.cc @@ -0,0 +1,364 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/api/opt_base.h" +#include "all_kernel_faked.cc" // NOLINT + +namespace paddle { +namespace lite_api { + +void OptBase::SetModelDir(const std::string& model_path) { + opt_config_.set_model_dir(model_path); +} + +void OptBase::SetModelFile(const std::string& model_path) { + opt_config_.set_model_file(model_path); +} + +void OptBase::SetParamFile(const std::string& param_path) { + opt_config_.set_param_file(param_path); +} + +void OptBase::SetModelType(std::string optimize_out_type) { + if (optimize_out_type == "protobuf") { + model_type_ = LiteModelType::kProtobuf; + } else if (optimize_out_type == "naive_buffer") { + model_type_ = LiteModelType::kNaiveBuffer; + } else { + LOG(FATAL) << "Unsupported Model type :" << optimize_out_type; + } +} + +void OptBase::SetValidPlaces(const std::string& valid_places) { + valid_places_.clear(); + auto target_reprs = lite::Split(valid_places, ","); + for (auto& target_repr : target_reprs) { + if (target_repr == "arm") { + valid_places_.emplace_back(TARGET(kARM)); + } else if (target_repr == "opencl") { + valid_places_.emplace_back( + Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault)}); + valid_places_.emplace_back( + Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)}); + valid_places_.emplace_back( + Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kImageDefault)}); + valid_places_.emplace_back( + Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)}); + valid_places_.emplace_back( + TARGET(kARM)); // enable kARM CPU kernel when no opencl kernel + } else if 
(target_repr == "x86") { + valid_places_.emplace_back(TARGET(kX86)); + } else if (target_repr == "npu") { + valid_places_.emplace_back(TARGET(kNPU)); + } else if (target_repr == "xpu") { + valid_places_.emplace_back(TARGET(kXPU)); + } else { + LOG(FATAL) << lite::string_format( + "Wrong target '%s' found, please check the command flag " + "'valid_targets'", + target_repr.c_str()); + } + } + CHECK(!valid_places_.empty()) + << "At least one target should be set, should set the " + "command argument 'valid_targets'"; +} + +void OptBase::SetOptimizeOut(const std::string& optimized_out_path) { + optimize_out_path_ = optimized_out_path; +} + +void OptBase::RunOptimize(bool record_strip_info) { + CheckIfModelSupported(false); + OpKernelInfoCollector::Global().SetKernel2path(kernel2path_map); + opt_config_.set_valid_places(valid_places_); + if (model_set_dir_ != "") { + RunOptimizeFromModelSet(record_strip_info); + } else { + auto opt_predictor = lite_api::CreatePaddlePredictor(opt_config_); + opt_predictor->SaveOptimizedModel( + optimize_out_path_, model_type_, record_strip_info); + auto resulted_model_name = + record_strip_info ? 
"information of stripped model" : "optimized model"; + std::cout << "Save the " << resulted_model_name + << " into: " << optimize_out_path_ << " successfully"; + } +} + +// collect ops info of modelset +void CollectModelMetaInfo(const std::string& output_dir, + const std::vector& models, + const std::string& filename) { + std::set total; + for (const auto& name : models) { + std::string model_path = + lite::Join({output_dir, name, filename}, "/"); + auto lines = lite::ReadLines(model_path); + total.insert(lines.begin(), lines.end()); + } + std::string output_path = + lite::Join({output_dir, filename}, "/"); + lite::WriteLines(std::vector(total.begin(), total.end()), + output_path); +} + +void OptBase::SetModelSetDir(const std::string& model_set_path) { + model_set_dir_ = model_set_path; +} +void OptBase::RunOptimizeFromModelSet(bool record_strip_info) { + // 1. mkdir of output optimized model set. + lite::MkDirRecur(optimize_out_path_); + auto model_dirs = lite::ListDir(model_set_dir_, true); + if (model_dirs.size() == 0) { + LOG(FATAL) << "[" << model_set_dir_ << "] does not contain any model"; + } + + // 2. optimize each model in input model set dir. 
+ std::string model_file = opt_config_.model_file(); + std::string param_file = opt_config_.param_file(); + for (const auto& name : model_dirs) { + std::string input_model_dir = + lite::Join({model_set_dir_, name}, "/"); + std::string output_model_dir = + lite::Join({optimize_out_path_, name}, "/"); + + if (opt_config_.model_file() != "" && opt_config_.param_file() != "") { + auto model_file_path = + lite::Join({input_model_dir, model_file}, "/"); + auto param_file_path = + lite::Join({input_model_dir, param_file}, "/"); + } + + std::cout << "Start optimize model: " << input_model_dir; + + opt_config_.set_model_dir(input_model_dir); + opt_config_.set_model_file(model_file); + opt_config_.set_param_file(param_file); + + auto opt_predictor = lite_api::CreatePaddlePredictor(opt_config_); + opt_predictor->SaveOptimizedModel( + optimize_out_path_, model_type_, record_strip_info); + + std::cout << "Optimize done. "; + } + + // 3. if record_strip_info = true, we will record striping info + if (record_strip_info) { + // Collect all models information + CollectModelMetaInfo( + optimize_out_path_, model_dirs, lite::TAILORD_OPS_SOURCE_LIST_FILENAME); + CollectModelMetaInfo( + optimize_out_path_, model_dirs, lite::TAILORD_OPS_LIST_NAME); + CollectModelMetaInfo(optimize_out_path_, + model_dirs, + lite::TAILORD_KERNELS_SOURCE_LIST_FILENAME); + CollectModelMetaInfo( + optimize_out_path_, model_dirs, lite::TAILORD_KERNELS_LIST_NAME); + std::cout << "Record the information of stripped models into :" + << optimize_out_path_ << "successfully"; + } +} + +void OptBase::PrintHelpInfo() { + const std::string opt_version = lite::version(); + const char help_info[] = + "At least one argument should be inputed. 
Valid arguments are listed " + "below:\n" + " Arguments of help information:\n" + " `help()` Print help information\n" + " Arguments of model optimization:\n" + " `set_model_dir(model_dir)`\n" + " `set_model_file(model_file_path)`\n" + " `set_param_file(param_file_path)`\n" + " `set_model_type(protobuf|naive_buffer)`\n" + " `set_optimize_out(output_optimize_model_dir)`\n" + " `set_valid_places(arm|opencl|x86|npu|xpu)`\n" + " `run_optimize(false|true)`\n" + " ` ----false&true refer to whether to record ops info for " + "tailoring lib, false by default`\n" + " Arguments of model checking and ops information:\n" + " `print_all_ops()` Display all the valid operators of " + "Paddle-Lite\n" + " `print_supported_ops` Display supported operators of valid " + "places\n" + " `check_if_model_supported()` Check if the input model is " + "supported\n"; + + std::cout << "opt version:" << opt_version << std::endl + << help_info << std::endl; +} +// 2. Print supported info of input ops +void OptBase::PrintOpsInfo(const std::set& valid_ops) { + std::vector lite_supported_targets = {"kHost", + "kX86", + "kCUDA", + "kARM", + "kOpenCL", + "kFPGA", + "kNPU", + "kXPU", + "kAny", + "kUnk"}; + // Get the length of the first column: maximum length of the op_type + size_t maximum_optype_length = 0; + for (auto it = supported_ops.begin(); it != supported_ops.end(); it++) { + maximum_optype_length = it->first.size() > maximum_optype_length + ? it->first.size() + : maximum_optype_length; + } + std::cout << std::setiosflags(std::ios::internal); + // Print the first row: OP_name target1 target2 ... 
+ std::cout << std::setw(maximum_optype_length) << "OP_name"; + for (size_t i = 0; i < lite_supported_targets.size(); i++) { + std::cout << std::setw(10) << lite_supported_targets[i].substr(1); + } + std::cout << std::endl; + // Print the name of supported ops and mark if it's supported by each target + // print the support info of inputed ops: valid_ops + for (auto op = valid_ops.begin(); op != valid_ops.end(); op++) { + std::cout << std::setw(maximum_optype_length) << *op; + // Check: If this kernel doesn't match any operator, we will skip it. + if (supported_ops.find(*op) == supported_ops.end()) { + continue; + } + // Print OP info. + auto ops_valid_places = supported_ops.at(*op); + for (size_t i = 0; i < lite_supported_targets.size(); i++) { + if (std::find(ops_valid_places.begin(), + ops_valid_places.end(), + lite_supported_targets[i]) != ops_valid_places.end()) { + std::cout << std::setw(10) << "Y"; + } else { + std::cout << std::setw(10) << " "; + } + } + std::cout << std::endl; + } +} + +void OptBase::DisplayKernelsInfo() { // Display kernel information + std::cout << ::paddle::lite::KernelRegistry::Global().DebugString(); +} +void OptBase::PrintAllOps() { + // 1. Get supported ops on these targets + std::set valid_ops; + for (size_t i = 0; i < supported_ops_target.size(); i++) { + auto ops = supported_ops_target[i]; + valid_ops.insert(ops.begin(), ops.end()); + } + // 2. Print support info of these ops + PrintOpsInfo(valid_ops); +} + +void OptBase::PrintSupportedOps() { + // 1. 
Get the valid hardware targets + std::vector target_types = {}; + for (size_t i = 0; i < valid_places_.size(); i++) { + target_types.push_back(valid_places_[i].target); + } + std::string targets_str = TargetToStr(target_types[0]); + for (size_t i = 1; i < target_types.size(); i++) { + targets_str = targets_str + TargetToStr(target_types[i]); + } + std::cout << "Supported OPs on '" << targets_str << "': " << std::endl; + target_types.push_back(TARGET(kHost)); + target_types.push_back(TARGET(kUnk)); + + // 2. Get supported ops on these targets + std::set valid_ops; + for (size_t i = 0; i < target_types.size(); i++) { + auto ops = supported_ops_target[static_cast(target_types[i])]; + valid_ops.insert(ops.begin(), ops.end()); + } + // 3. Print support info of these ops + PrintOpsInfo(valid_ops); +} + +// test whether this model is supported +void OptBase::CheckIfModelSupported(bool print_ops_info) { + // 1. parse valid places and valid targets + auto valid_ops = supported_ops_target[static_cast(TARGET(kHost))]; + auto valid_unktype_ops = supported_ops_target[static_cast(TARGET(kUnk))]; + valid_ops.insert( + valid_ops.end(), valid_unktype_ops.begin(), valid_unktype_ops.end()); + for (size_t i = 0; i < valid_places_.size(); i++) { + auto target = valid_places_[i].target; + auto ops = supported_ops_target[static_cast(target)]; + valid_ops.insert(valid_ops.end(), ops.begin(), ops.end()); + } + // get valid ops + std::set valid_ops_set(valid_ops.begin(), valid_ops.end()); + + // 2.Load model into program to get ops in model + std::string prog_path = opt_config_.model_dir() + "/__model__"; + if (!(opt_config_.model_file()).empty() && + !(opt_config_.param_file()).empty()) { + prog_path = opt_config_.model_file(); + } + lite::cpp::ProgramDesc cpp_prog; + framework::proto::ProgramDesc pb_proto_prog = + *lite::LoadProgram(prog_path, false); + lite::pb::ProgramDesc pb_prog(&pb_proto_prog); + // Transform to cpp::ProgramDesc + lite::TransformProgramDescAnyToCpp(pb_prog, 
&cpp_prog); + + std::set unsupported_ops; + std::set input_model_ops; + for (size_t index = 0; index < cpp_prog.BlocksSize(); index++) { + auto current_block = cpp_prog.GetBlock(index); + for (size_t i = 0; i < current_block->OpsSize(); ++i) { + auto& op_desc = *current_block->GetOp(i); + auto op_type = op_desc.Type(); + input_model_ops.insert(op_type); + if (valid_ops_set.count(op_type) == 0) { + unsupported_ops.insert(op_type); + } + } + } + // 3. Print ops_info of input model and check if this model is supported + if (print_ops_info) { + std::cout << "OPs in the input model include:\n"; + PrintOpsInfo(input_model_ops); + } + if (!unsupported_ops.empty()) { + std::string unsupported_ops_str = *unsupported_ops.begin(); + for (auto op_str = ++unsupported_ops.begin(); + op_str != unsupported_ops.end(); + op_str++) { + unsupported_ops_str = unsupported_ops_str + ", " + *op_str; + } + std::vector targets = {}; + for (size_t i = 0; i < valid_places_.size(); i++) { + targets.push_back(valid_places_[i].target); + } + std::sort(targets.begin(), targets.end()); + targets.erase(unique(targets.begin(), targets.end()), targets.end()); + std::string targets_str = TargetToStr(targets[0]); + for (size_t i = 1; i < targets.size(); i++) { + targets_str = targets_str + "," + TargetToStr(targets[i]); + } + + LOG(ERROR) << "Error: This model is not supported, because " + << unsupported_ops.size() << " ops are not supported on '" + << targets_str << "'. These unsupported ops are: '" + << unsupported_ops_str << "'."; + exit(1); + } + if (print_ops_info) { + std::cout << "Paddle-Lite supports this model!" << std::endl; + exit(1); + } +} +} // namespace lite_api +} // namespace paddle diff --git a/lite/api/opt_base.h b/lite/api/opt_base.h new file mode 100644 index 0000000000000000000000000000000000000000..a8d6d0390ccd3f1c9b0291b1bcf6eb1ecc47a248 --- /dev/null +++ b/lite/api/opt_base.h @@ -0,0 +1,86 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/* + * This file defines Opt and basic functions about model transformation. + */ + +#ifndef PADDLE_LITE_OPT_H_ // NOLINT +#define PADDLE_LITE_OPT_H_ +#include +#include +#include +#include +#include +// stores the map that records the source_file path of each kernel. +#include "kernel_src_map.h" // NOLINT +#include "lite/api/cxx_api.h" +// version of Paddle-lite +#include "lite/core/version.h" +// model parser functions to pre-load model to verify if this model is supported +#include "lite/model_parser/compatible_pb.h" +#include "lite/model_parser/pb/program_desc.h" +#include "lite/utils/string.h" +// recorded all the ops supported by paddle-lite +#include "supported_kernel_op_info.h" // NOLINT + +namespace paddle { +namespace lite_api { + +/// The PaddlePredictor defines the basic interfaces for different kinds of +/// predictors. 
+class LITE_API OptBase { + public: + OptBase() = default; + void SetModelSetDir(const std::string &model_set_path); + void SetModelDir(const std::string &model_path); + void SetModelFile(const std::string &model_path); + void SetParamFile(const std::string ¶m_path); + void SetValidPlaces(const std::string &valid_places); + void SetOptimizeOut(const std::string &optimized_out_path); + // set optimized_model type + void SetModelType(std::string model_type); + // transform and save the optimized model + void RunOptimize(bool record_strip_info = false); + + // fuctions of printing info + // 1. help info + void PrintHelpInfo(); + // 2. PrintOpsInfo + void PrintOpsInfo(const std::set &valid_ops = + {}); // print supported ops on target_types + void PrintAllOps(); // print all ops + void PrintSupportedOps(); // print ops supported on valid_places_ + void DisplayKernelsInfo(); // Display kernel information + // 3. Check if this model is supported + void CheckIfModelSupported(bool print_ops_info = true); + + private: + CxxConfig opt_config_; + // valid places for the optimized_model + std::vector valid_places_; + // filename of the optimized_model + std::string optimize_out_path_; + // type of the optimized_model, kNaiveBuffer default. 
+ LiteModelType model_type_{LiteModelType::kNaiveBuffer}; + // Dir path of a set of models, this should be combined with model + std::string model_set_dir_; + + void RunOptimizeFromModelSet(bool record_strip_info = false); +}; + +} // namespace lite_api +} // namespace paddle + +#endif // NOLINT diff --git a/lite/api/paddle_api.cc b/lite/api/paddle_api.cc index 9f071cf7780e27defdd1fcd6be02844618165fb6..2cb2064da518bca442e882d0733c5c6966c4fac0 100644 --- a/lite/api/paddle_api.cc +++ b/lite/api/paddle_api.cc @@ -38,6 +38,7 @@ void Tensor::Resize(const shape_t &shape) { tensor(raw_tensor_)->Resize(shape); } +// Tensor::data template <> const float *Tensor::data() const { return ctensor(raw_tensor_)->data(); @@ -47,15 +48,19 @@ const int8_t *Tensor::data() const { return ctensor(raw_tensor_)->data(); } template <> +const uint8_t *Tensor::data() const { + return ctensor(raw_tensor_)->data(); +} +template <> const int64_t *Tensor::data() const { return ctensor(raw_tensor_)->data(); } - template <> const int32_t *Tensor::data() const { return ctensor(raw_tensor_)->data(); } +// Tensor::mutable_data template <> int *Tensor::mutable_data(TargetType type) const { return tensor(raw_tensor_)->mutable_data(type); @@ -69,6 +74,10 @@ int8_t *Tensor::mutable_data(TargetType type) const { return tensor(raw_tensor_)->mutable_data(type); } template <> +uint8_t *Tensor::mutable_data(TargetType type) const { + return tensor(raw_tensor_)->mutable_data(type); +} +template <> int64_t *Tensor::mutable_data(TargetType type) const { return tensor(raw_tensor_)->mutable_data(type); } @@ -116,18 +125,22 @@ void Tensor::CopyToCpu(T *data) const { template void Tensor::CopyFromCpu(const int *); template void Tensor::CopyFromCpu(const float *); template void Tensor::CopyFromCpu(const int8_t *); +template void Tensor::CopyFromCpu(const uint8_t *); template void Tensor::CopyFromCpu(const int *); template void Tensor::CopyFromCpu(const float *); template void Tensor::CopyFromCpu(const int8_t *); 
+template void Tensor::CopyFromCpu(const uint8_t *); + template void Tensor::CopyFromCpu(const int *); template void Tensor::CopyFromCpu(const int64_t *); template void Tensor::CopyFromCpu(const float *); template void Tensor::CopyFromCpu(const int8_t *); -template void Tensor::CopyToCpu(int8_t *) const; template void Tensor::CopyToCpu(float *) const; template void Tensor::CopyToCpu(int *) const; +template void Tensor::CopyToCpu(int8_t *) const; +template void Tensor::CopyToCpu(uint8_t *) const; shape_t Tensor::shape() const { return ctensor(raw_tensor_)->dims().Vectorize(); diff --git a/lite/api/paddle_api.h b/lite/api/paddle_api.h index 307eeb74e8b4cdc3b2d6188eb18490e4dcf89b8f..c445ef641b96d9fbbc5b4123be794976c0cf03c4 100644 --- a/lite/api/paddle_api.h +++ b/lite/api/paddle_api.h @@ -206,7 +206,7 @@ class LITE_API MobileConfig : public ConfigBase { }; template -std::shared_ptr CreatePaddlePredictor(const ConfigT&); +LITE_API std::shared_ptr CreatePaddlePredictor(const ConfigT&); } // namespace lite_api } // namespace paddle diff --git a/lite/api/paddle_api_test.cc b/lite/api/paddle_api_test.cc index 9213a24e5c0614550a098c4de8d97b6cf6695177..9b8384f2823ee121aa8bb505dd135735d9f96774 100644 --- a/lite/api/paddle_api_test.cc +++ b/lite/api/paddle_api_test.cc @@ -15,9 +15,6 @@ #include "lite/api/paddle_api.h" #include #include -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" #include "lite/utils/cp_logging.h" #include "lite/utils/io.h" DEFINE_string(model_dir, "", ""); diff --git a/lite/api/paddle_place.cc b/lite/api/paddle_place.cc index 2cced919e601f8ecb79ce262a2b083d5b6862da9..dba65656cbcffb00319c8f6083909e487e3df7a2 100644 --- a/lite/api/paddle_place.cc +++ b/lite/api/paddle_place.cc @@ -45,6 +45,21 @@ std::string Place::DebugString() const { return os.str(); } +const std::string& ActivationTypeToStr(ActivationType act) { + static const std::string act2string[] = {"unk", + "Relu", + "Relu6", + 
"PRelu", + "LeakyRelu", + "Sigmoid", + "Tanh", + "Swish", + "Exp"}; + auto x = static_cast(act); + CHECK_LT(x, static_cast(ActivationType::NUM)); + return act2string[x]; +} + const std::string& TargetToStr(TargetType target) { static const std::string target2string[] = {"unk", "host", diff --git a/lite/api/paddle_place.h b/lite/api/paddle_place.h index 7da52adc7fb6fdd70de3b098508e4622496bed7d..1de46a39467af125e705cfcb7a9eeae64a0be133 100644 --- a/lite/api/paddle_place.h +++ b/lite/api/paddle_place.h @@ -54,7 +54,8 @@ enum class TargetType : int { kXPU = 9, kBM = 10, kAny = 6, // any target - NUM = 11, // number of fields. + kMLU = 11, + NUM = 12, // number of fields. }; enum class PrecisionType : int { kUnk = 0, @@ -96,7 +97,9 @@ enum class ActivationType : int { kLeakyRelu = 4, kSigmoid = 5, kTanh = 6, - kSwish = 7 + kSwish = 7, + kExp = 8, + NUM = 9, }; static size_t PrecisionTypeLength(PrecisionType type) { @@ -148,6 +151,8 @@ _ForEachPrecisionType(DefinePrecisionTypeTrait); #define PRECISION(item__) paddle::lite_api::PrecisionType::item__ #define DATALAYOUT(item__) paddle::lite_api::DataLayoutType::item__ +const std::string& ActivationTypeToStr(ActivationType act); + const std::string& TargetToStr(TargetType target); const std::string& PrecisionToStr(PrecisionType precision); diff --git a/lite/api/paddle_use_passes.h b/lite/api/paddle_use_passes.h index 943760d30742b74a0fe9150e4c2d8c8bb5dbc52a..41eca021a9ded40134122cb7b68604d9cd8f9fc2 100644 --- a/lite/api/paddle_use_passes.h +++ b/lite/api/paddle_use_passes.h @@ -24,7 +24,7 @@ USE_MIR_PASS(generate_program_pass); USE_MIR_PASS(io_copy_kernel_pick_pass); USE_MIR_PASS(argument_type_display_pass); USE_MIR_PASS(runtime_context_assign_pass); -USE_MIR_PASS(graph_visualze); +USE_MIR_PASS(graph_visualize_pass); USE_MIR_PASS(lite_conv_bn_fuse_pass); USE_MIR_PASS(lite_fc_fuse_pass); @@ -40,8 +40,10 @@ USE_MIR_PASS(lite_elementwise_add_activation_fuse_pass); USE_MIR_PASS(lite_quant_dequant_fuse_pass); 
USE_MIR_PASS(type_precision_cast_pass); USE_MIR_PASS(type_layout_cast_pass); +USE_MIR_PASS(type_layout_cast_preprocess_pass); USE_MIR_PASS(memory_optimize_pass); USE_MIR_PASS(elementwise_mul_constant_eliminate_pass) USE_MIR_PASS(npu_subgraph_pass); USE_MIR_PASS(xpu_subgraph_pass); USE_MIR_PASS(weight_quantization_preprocess_pass); +USE_MIR_PASS(quantized_op_attributes_inference_pass); diff --git a/lite/api/python/CMakeLists.txt b/lite/api/python/CMakeLists.txt index 43178a37c663bb09acb7c025e021cbc91bf0cc5d..ba0c6eb2404ce1ffc2ad5950ee5a3476d42f01b8 100644 --- a/lite/api/python/CMakeLists.txt +++ b/lite/api/python/CMakeLists.txt @@ -2,6 +2,23 @@ if (NOT LITE_WITH_PYTHON) return() endif() +# to create setup.py for packeting whl for Paddle-Lite and opt + +execute_process( + COMMAND git describe --tags --exact-match + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} + OUTPUT_VARIABLE PADDLE_LITE_TAG + OUTPUT_STRIP_TRAILING_WHITESPACE +) + +execute_process( + COMMAND git log -1 --format=%h + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} + OUTPUT_VARIABLE PADDLE_LITE_COMMIT + OUTPUT_STRIP_TRAILING_WHITESPACE +) +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in + ${CMAKE_CURRENT_BINARY_DIR}/setup.py) add_subdirectory(pybind) #add_subdirectory(interface) diff --git a/lite/api/python/__init__.py b/lite/api/python/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..abf198b97e6e818e1fbe59006f98492640bcee54 --- /dev/null +++ b/lite/api/python/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/lite/api/python/pybind/CMakeLists.txt b/lite/api/python/pybind/CMakeLists.txt index eabb6b150b93a722282118c3932676cd1aee5da8..b1de18d50c1582b0f872ad38d24939665ab1d3b0 100644 --- a/lite/api/python/pybind/CMakeLists.txt +++ b/lite/api/python/pybind/CMakeLists.txt @@ -1,6 +1,6 @@ set(PYBIND_DEPS pybind python paddle_api_light paddle_api) if (NOT LITE_ON_TINY_PUBLISH) - set(PYBIND_DEPS ${PYBIND_DEPS} paddle_api_full) + set(PYBIND_DEPS ${PYBIND_DEPS} paddle_api_full opt_base) endif() lite_cc_library(lite_pybind SHARED SRCS pybind.cc DEPS ${PYBIND_DEPS}) diff --git a/lite/api/python/pybind/pybind.cc b/lite/api/python/pybind/pybind.cc index 2dfe0c49490ecd13e8a3ce480807bdf3875348b7..e86d570e18b50bdc3d8943ecdd3732f8475ad56c 100644 --- a/lite/api/python/pybind/pybind.cc +++ b/lite/api/python/pybind/pybind.cc @@ -26,13 +26,11 @@ #ifndef LITE_ON_TINY_PUBLISH #include "lite/api/cxx_api.h" -#include "lite/api/paddle_use_passes.h" +#include "lite/api/opt_base.h" #endif #include "lite/api/light_api.h" #include "lite/api/paddle_api.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" #include "lite/core/tensor.h" namespace py = pybind11; @@ -50,10 +48,27 @@ using lite_api::PrecisionType; using lite_api::DataLayoutType; using lite_api::Place; using lite::LightPredictorImpl; +using lite_api::OptBase; #ifndef LITE_ON_TINY_PUBLISH using lite::CxxPaddleApiImpl; static void BindLiteCxxPredictor(py::module *m); +void BindLiteOpt(py::module *m) { + py::class_ opt_base(*m, "Opt"); + opt_base.def(py::init<>()) + .def("set_model_dir", 
&OptBase::SetModelDir) + .def("set_modelset_dir", &OptBase::SetModelSetDir) + .def("set_model_file", &OptBase::SetModelFile) + .def("set_param_file", &OptBase::SetParamFile) + .def("set_valid_places", &OptBase::SetValidPlaces) + .def("set_optimize_out", &OptBase::SetOptimizeOut) + .def("set_model_type", &OptBase::SetModelType) + .def("run_optimize", &OptBase::RunOptimize) + .def("help", &OptBase::PrintHelpInfo) + .def("print_supported_ops", &OptBase::PrintSupportedOps) + .def("display_kernels_info", &OptBase::DisplayKernelsInfo) + .def("print_all_ops", &OptBase::PrintAllOps); +} #endif static void BindLiteLightPredictor(py::module *m); static void BindLiteCxxConfig(py::module *m); diff --git a/lite/api/python/pybind/pybind.h b/lite/api/python/pybind/pybind.h index ca05f24b32fd0b0418d9cf595fe6134b34fa725f..15609957e05391be54466262f962e151594ef383 100644 --- a/lite/api/python/pybind/pybind.h +++ b/lite/api/python/pybind/pybind.h @@ -22,11 +22,15 @@ namespace lite { namespace pybind { void BindLiteApi(pybind11::module *m); +void BindLiteOpt(pybind11::module *m); -PYBIND11_MODULE(lite_core, m) { +PYBIND11_MODULE(lite, m) { m.doc() = "C++ core of Paddle-Lite"; BindLiteApi(&m); +#ifndef LITE_ON_TINY_PUBLISH + BindLiteOpt(&m); +#endif } } // namespace pybind diff --git a/lite/api/python/setup.py.in b/lite/api/python/setup.py.in new file mode 100644 index 0000000000000000000000000000000000000000..79028fb7493bf55eab74aa76ee51ac79f418ba0a --- /dev/null +++ b/lite/api/python/setup.py.in @@ -0,0 +1,72 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# module of pack whl installer for Paddle-lite + +import shutil +import os +from setuptools import setup, Distribution + + +class BinaryDistribution(Distribution): + 'binary distribution' + def has_ext_modules(foo): + return True + + +# get paddle-lite version, if it's not based on a release tag, we use commit id instead +PADDLELITE_COMMITE = "@PADDLE_LITE_COMMIT@" +PADDLELITE_TAG = "@PADDLE_LITE_TAG@" +if PADDLELITE_TAG == "": + PADDLELITE_VERSION = PADDLELITE_COMMITE +else: + PADDLELITE_VERSION = PADDLELITE_TAG + +# core lib of paddlelite is stored as lite.so +LITE_PATH = '${PADDLE_BINARY_DIR}/inference_lite_lib/python/install/lite' +PACKAGE_DATA = {'paddlelite': ['lite.so']} +# put all thirdparty libraries in paddlelite.libs +PACKAGE_DATA['paddlelite.libs'] = [] +LIB_PATH = '${PADDLE_BINARY_DIR}/inference_lite_lib/python/install/libs' +if '${WITH_MKL}' == 'ON': + shutil.copy('${MKLML_SHARED_IOMP_LIB}', LIB_PATH) + shutil.copy('${MKLML_SHARED_LIB}', LIB_PATH) + PACKAGE_DATA['paddlelite.libs'] += ['libmklml_intel.so', 'libiomp5.so'] + +# link lite.so to paddlelite.libs +COMMAND = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}\ +/inference_lite_lib/python/install/lite/lite.so" +if os.system(COMMAND) != 0: + raise Exception("patch third_party libs failed, command: %s" % COMMAND) + +# remove unused paddle/libs/__init__.py +if os.path.isfile(LIB_PATH+'/__init__.py'): + os.remove(LIB_PATH+'/__init__.py') + +# set dir path of each package +PACKAGE_DIR = { + # The paddle.fluid.proto will be generated while compiling. 
+ # So that package points to other directory. + 'paddlelite.libs': LIB_PATH, + 'paddlelite': LITE_PATH +} + +setup( + name='paddlelite', + version=PADDLELITE_VERSION, + description='Paddle-Lite Library', + packages=['paddlelite', 'paddlelite.libs'], + package_dir=PACKAGE_DIR, + package_data=PACKAGE_DATA, + distclass=BinaryDistribution +) diff --git a/lite/api/test_resnet50_lite_bm.cc b/lite/api/test_classify_lite_bm.cc similarity index 97% rename from lite/api/test_resnet50_lite_bm.cc rename to lite/api/test_classify_lite_bm.cc index 62a58704f4245b8618540ea7109447dd99d0bfea..7da7dc03745aa623e35dec5b344e16de03cf5aca 100644 --- a/lite/api/test_resnet50_lite_bm.cc +++ b/lite/api/test_classify_lite_bm.cc @@ -33,7 +33,6 @@ namespace lite { void TestModel(const std::vector& valid_places) { lite::Predictor predictor; std::vector passes; - passes.push_back("bm_subgraph_pass"); predictor.Build(FLAGS_model_dir, "", "", valid_places, passes); auto* input_tensor = predictor.GetInput(0); @@ -81,7 +80,7 @@ void TestModel(const std::vector& valid_places) { fclose(fp); } -TEST(ResNet50, test_bm) { +TEST(Classify, test_bm) { std::vector valid_places({Place{TARGET(kBM), PRECISION(kFloat)}, Place{TARGET(kX86), PRECISION(kFloat)}}); diff --git a/lite/api/test_helper.h b/lite/api/test_helper.h index 71752c942bb53e7f2ed289ac0d965ae1d1007c55..a17fc331310cfe17ec36be504b94ddacc724e90f 100644 --- a/lite/api/test_helper.h +++ b/lite/api/test_helper.h @@ -17,6 +17,7 @@ #include #include #include +#include // for eval DEFINE_string(model_dir, "", "model dir"); @@ -43,5 +44,31 @@ inline double GetCurrentUS() { return 1e+6 * time.tv_sec + time.tv_usec; } +template +double compute_mean(const T* in, const size_t length) { + double sum = 0.; + for (size_t i = 0; i < length; ++i) { + sum += in[i]; + } + return sum / length; +} + +template +double compute_standard_deviation(const T* in, + const size_t length, + bool has_mean = false, + double mean = 10000) { + if (!has_mean) { + mean = 
compute_mean(in, length); + } + + double variance = 0.; + for (size_t i = 0; i < length; ++i) { + variance += pow((in[i] - mean), 2); + } + variance /= length; + return sqrt(variance); +} + } // namespace lite } // namespace paddle diff --git a/lite/api/transform_test.cc b/lite/api/transform_test.cc index 8e51f3778d30ba9fcfde493c3e27ecc973e66a59..896b47a97fb20e6935764e12fbe9ebd646a4f816 100644 --- a/lite/api/transform_test.cc +++ b/lite/api/transform_test.cc @@ -28,11 +28,10 @@ DEFINE_int32(batch, 1, "batch"); namespace paddle { namespace lite { -namespace test_transformer { +namespace test_transformer { std::vector inputed_lines; - -void LoadInputLines(const char* filename) { +void load_input_lines(const char* filename) { static const int max_line_buf_size = 100 * 1024 * 1024; char* line_buffer = (char*)calloc(max_line_buf_size, sizeof(char)); // NOLINT FILE* input_file = fopen(filename, "r"); @@ -49,7 +48,7 @@ void LoadInputLines(const char* filename) { line_buffer = NULL; fclose(input_file); } -void Split2(const std::string& main_str, +void split2(const std::string& main_str, std::vector& str_list, // NOLINT const std::string& delimiter) { size_t pre_pos = 0; @@ -75,19 +74,19 @@ void Split2(const std::string& main_str, } } // NOLINT -void PadBatchInput(std::vector& input_lines, // NOLINT - int pad_idx, - int n_head, - Tensor* src_word, - Tensor* src_pos, - Tensor* src_attn_bias, - Tensor* trg_word, - Tensor* init_scores, - Tensor* init_idx, - Tensor* trg_bias, - int line_start, - int batch_size, - int bos_idx) { +void pad_batch_input(std::vector& input_lines, // NOLINT + int pad_idx, + int n_head, + Tensor* src_word, + Tensor* src_pos, + Tensor* src_attn_bias, + Tensor* trg_word, + Tensor* init_scores, + Tensor* init_idx, + Tensor* trg_bias, + int line_start, + int batch_size, + int bos_idx) { int max_len = 0; int max_line = input_lines.size(); @@ -98,27 +97,27 @@ void PadBatchInput(std::vector& input_lines, // NOLINT std::vector split_str; - 
test_transformer::Split2(cur_line, split_str, " "); + test_transformer::split2(cur_line, split_str, " "); batch_lines.push_back(split_str); max_len = max_len >= split_str.size() ? max_len : split_str.size(); } - src_word->Resize(std::vector({batch_size, max_len, 1})); - src_pos->Resize(std::vector({batch_size, max_len, 1})); + src_word->Resize(std::vector({batch_size, max_len})); + src_pos->Resize(std::vector({batch_size, max_len})); src_attn_bias->Resize( std::vector({batch_size, n_head, max_len, max_len})); trg_bias->Resize( - std::vector({batch_size, n_head, 1, max_len})); - float* src_word_data = src_word->mutable_data(); - float* src_pos_data = src_pos->mutable_data(); + std::vector({batch_size, n_head, max_len, max_len})); + auto* src_word_data = src_word->mutable_data(); + auto* src_pos_data = src_pos->mutable_data(); float* src_bias_data = src_attn_bias->mutable_data(); float* trg_bias_data = trg_bias->mutable_data(); for (int i = 0; i < batch_size; ++i) { std::vector cur_words = batch_lines[i]; int fill_len = cur_words.size(); int src_bias_start = i * n_head * max_len * max_len; - int trg_bias_start = i * n_head * max_len; + int trg_bias_start = i * n_head * max_len * max_len; for (int j = 0; j < fill_len; ++j) { src_word_data[i * max_len + j] = (atoi(cur_words[j].c_str())); src_pos_data[i * max_len + j] = j; @@ -137,22 +136,24 @@ void PadBatchInput(std::vector& input_lines, // NOLINT int value_ind = j % max_len + src_bias_start; src_bias_data[j] = src_bias_data[value_ind]; } - for (int j = trg_bias_start; j < trg_bias_start + n_head * max_len; ++j) { + for (int j = trg_bias_start; + j < trg_bias_start + n_head * max_len * max_len; + ++j) { int value_ind = j % max_len + trg_bias_start; trg_bias_data[j] = trg_bias_data[value_ind]; } } - trg_word->Resize(std::vector({batch_size, 1, 1})); - auto* trg_word_data = trg_word->mutable_data(); - for (int i = 0; i < batch_size; ++i) { + trg_word->Resize(std::vector({batch_size, max_len})); + auto* trg_word_data = 
trg_word->mutable_data(); + for (int i = 0; i < batch_size * max_len; ++i) { trg_word_data[i] = bos_idx; } init_scores->Resize(std::vector({batch_size, 1})); init_idx->Resize(std::vector({batch_size})); float* score_data = init_scores->mutable_data(); - float* idx_data = init_idx->mutable_data(); + auto* idx_data = init_idx->mutable_data(); for (int i = 0; i < init_scores->numel(); ++i) { score_data[i] = 0; } @@ -175,21 +176,25 @@ void PadBatchInput(std::vector& input_lines, // NOLINT void TestModel(const std::vector& valid_places, const Place& preferred_place, bool use_npu = false) { +#ifdef LITE_WITH_ARM DeviceInfo::Init(); DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, FLAGS_threads); +#endif lite::Predictor predictor; std::string test_data_path = FLAGS_input; - predictor.Build(FLAGS_model_dir, "", "", preferred_place, valid_places); + predictor.Build("", + FLAGS_model_dir + "/__model__", + FLAGS_model_dir + "/weights", + valid_places); + // predictor.Build(FLAGS_model_dir, "", "", valid_places); int n_head = 8; int batch_size = FLAGS_batch; int bos_idx = 0; int eos_idx = 1; - LOG(INFO) << "reading"; - test_transformer::LoadInputLines(test_data_path.c_str()); - LOG(INFO) << "reading finished"; + test_transformer::load_input_lines(test_data_path.c_str()); auto* trg_bias = predictor.GetInput(6); auto* src_word = predictor.GetInput(0); @@ -205,28 +210,31 @@ void TestModel(const std::vector& valid_places, auto start = GetCurrentUS(); for (int i = 0; i < FLAGS_repeats; ++i) { - auto start_i = GetCurrentUS(); - PadBatchInput(test_transformer::inputed_lines, - eos_idx, - n_head, - src_word, // src_word - src_pos, // src_pos - src_bias, // src_bias - trg_word, // trg_word - init_score, // init_score - init_idx, // init_idx - trg_bias, // trg_bias - i * batch_size, - batch_size, - bos_idx); - LOG(INFO) << "src_word:" << src_word->dims(); - auto start_ii = GetCurrentUS(); - LOG(INFO) << i << "->ii:" << (start_ii - start_i) / 1000.0; + 
pad_batch_input(test_transformer::inputed_lines, + eos_idx, + n_head, + src_word, // src_word + src_pos, // src_pos + src_bias, // src_bias + trg_word, // trg_word + init_score, // init_score + init_idx, // init_idx + trg_bias, // trg_bias + i * batch_size, + batch_size, + bos_idx); predictor.Run(); - auto start_iii = GetCurrentUS(); - LOG(INFO) << i << "->iii:" << (start_iii - start_ii) / 1000.0; - auto* outs = predictor.GetOutputs(); - LOG(INFO) << "out:" << (*outs)[0].dims(); + auto* outs = predictor.GetOutput(0); + auto o_data = outs->data(); + auto lod = outs->lod(); + for (int i = 0; i < outs->numel(); ++i) { + LOG(INFO) << o_data[i]; + } + for (int i = 0; i < lod.size(); ++i) { + for (int j = 0; j < lod[i].size(); ++j) { + LOG(INFO) << lod[i][j]; + } + } } LOG(INFO) << "================== Speed Report ==================="; @@ -234,25 +242,18 @@ void TestModel(const std::vector& valid_places, << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0 << " ms in average."; - - auto* outs = predictor.GetOutputs(); - for (auto out : *outs) { - LOG(INFO) << "======" - << "here"; - LOG(INFO) << out; - } - LOG(INFO) << "======" - << "hereggg"; } -TEST(OcrAttention, test_arm) { +} // namespace lite +} // namespace paddle +using namespace paddle::lite; // NOLINT +int main(int argc, char** argv) { + gflags::ParseCommandLineFlags(&argc, &argv, true); std::vector valid_places({ - Place{TARGET(kHost), PRECISION(kFloat)}, + Place{TARGET(kARM), PRECISION(kInt64)}, Place{TARGET(kARM), PRECISION(kFloat)}, + Place{TARGET(kHost), PRECISION(kFloat)}, }); TestModel(valid_places, Place({TARGET(kARM), PRECISION(kFloat)})); } - -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/CMakeLists.txt b/lite/backends/arm/math/CMakeLists.txt index 6f6f7e7aa71ba5067d831a2bcc2b7b933205fbe0..aecec295ae0269fb34a3c4fa38e396bdf98d4418 100644 --- a/lite/backends/arm/math/CMakeLists.txt +++ 
b/lite/backends/arm/math/CMakeLists.txt @@ -68,6 +68,8 @@ if (NOT HAS_ARM_MATH_LIB_DIR) gemv_arm_int8.cc conv3x3s1_direct_fp32.cc conv3x3s2_direct_fp32.cc + conv3x3s1p01_depthwise_fp32_relu.cc + conv3x3s2p01_depthwise_fp32_relu.cc conv3x3s1p01_depthwise_fp32.cc conv3x3s2p01_depthwise_fp32.cc conv3x3s1px_depthwise_fp32.cc @@ -123,5 +125,6 @@ if (NOT HAS_ARM_MATH_LIB_DIR) anchor_generator.cc split_merge_lod_tenosr.cc reduce_prod.cc + lstm.cc DEPS ${lite_kernel_deps} context tensor) endif() diff --git a/lite/backends/arm/math/activation.cc b/lite/backends/arm/math/activation.cc index 634021cc3ce82bbb5fba72123b38457ab0c7ac06..9f478eab60538eeca38415afea4e0989eff5a04e 100644 --- a/lite/backends/arm/math/activation.cc +++ b/lite/backends/arm/math/activation.cc @@ -700,6 +700,35 @@ void act_rsqrt(const float* din, float* dout, int size, int threads) { } } +template <> +void act_square(const float* din, float* dout, int size, int threads) { + const float* ptr_in = din; + float* ptr_out = dout; + for (int i = 0; i < size; ++i) { + ptr_out[0] = ptr_in[0] * ptr_in[0]; + ptr_in++; + ptr_out++; + } +} + +#ifdef LITE_WITH_TRAIN +template <> +void act_square_grad(const float* din, + const float* dout_grad, + float* din_grad, + int size, + int threads) { + const float* ptr_out_grad = dout_grad; + float* ptr_in_grad = din_grad; + for (int i = 0; i < size; ++i) { + ptr_in_grad[0] = ptr_out_grad[0] * 2.0 * din[0]; + ptr_out_grad++; + ptr_in_grad++; + din++; + } +} +#endif + } // namespace math } // namespace arm } // namespace lite diff --git a/lite/backends/arm/math/activation.h b/lite/backends/arm/math/activation.h index bb8189eef0d81a92caf2aaf73e401e20d9c80155..63f4418d70db25f98dea2a405de1f4bb6b0b9111 100644 --- a/lite/backends/arm/math/activation.h +++ b/lite/backends/arm/math/activation.h @@ -69,6 +69,15 @@ void act_hard_sigmoid(const T* din, template void act_rsqrt(const T* din, T* dout, int size, int threads); +template +void act_square(const T* din, T* dout, int size, int 
threads); + +#ifdef LITE_WITH_TRAIN +template +void act_square_grad( + const T* din, const T* dout_grad, T* din_grad, int size, int threads); +#endif + } // namespace math } // namespace arm } // namespace lite diff --git a/lite/backends/arm/math/argmax.cc b/lite/backends/arm/math/argmax.cc index 3ca6d97c4d8ab97ca58e9859bfd753f7bf7f05ad..4177ad0ae05a5f29be56e9e277c0161841ba6124 100644 --- a/lite/backends/arm/math/argmax.cc +++ b/lite/backends/arm/math/argmax.cc @@ -53,7 +53,7 @@ void argmax_func(const lite::Tensor *input, std::greater>()); // out - float *out_ptr = output->mutable_data() + n * out_channel + k; + int64_t *out_ptr = output->mutable_data() + n * out_channel + k; *out_ptr = vec[0].second; } } diff --git a/lite/backends/arm/math/beam_search.cc b/lite/backends/arm/math/beam_search.cc index f93fcc0d601cc076163e4d6fb1e31fc58e7035a8..32b7d3bfeba6107493d62a0c9be14a3c15ce7692 100644 --- a/lite/backends/arm/math/beam_search.cc +++ b/lite/backends/arm/math/beam_search.cc @@ -70,7 +70,7 @@ void PruneEndBeams(const Tensor *pre_ids, std::vector> *items, size_t lod_level, int end_id) { - auto *pre_ids_data = pre_ids->data(); + auto *pre_ids_data = pre_ids->data(); auto &high_level = abs_lod[lod_level]; for (size_t src_idx = 0; src_idx < high_level.size() - 1; ++src_idx) { size_t src_prefix_start = high_level[src_idx]; @@ -152,10 +152,10 @@ std::vector> SelectTopBeamSizeItems(const Tensor *pre_ids, // find the current candidates // auto abs_lod = framework::ToAbsOffset(scores->lod()); auto abs_lod = scores->lod(); - auto *pre_ids_data = pre_ids->data(); + auto *pre_ids_data = pre_ids->data(); auto *pre_scores_data = pre_scores->data(); - auto *ids_data = ids ? ids->data() : nullptr; + auto *ids_data = ids ? 
ids->data() : nullptr; auto *scores_data = scores->data(); size_t num_seqs = abs_lod[lod_level].size() - 1; @@ -236,7 +236,7 @@ void beam_search(const Tensor *pre_ids, if (parent_idx) { parent_idx->Resize(dims); } - auto *selected_ids_data = selected_ids->mutable_data(); + auto *selected_ids_data = selected_ids->mutable_data(); auto *selected_scores_data = selected_scores->mutable_data(); auto *parent_idx_data = parent_idx ? parent_idx->mutable_data() : nullptr; diff --git a/lite/backends/arm/math/conv3x3s1p01_depthwise_fp32.cc b/lite/backends/arm/math/conv3x3s1p01_depthwise_fp32.cc index 66d61413fc43fd518e0b34c7bc8d7b7bf5cc72a7..b024d69507101e902dc45fb83668e00dc718a6b0 100644 --- a/lite/backends/arm/math/conv3x3s1p01_depthwise_fp32.cc +++ b/lite/backends/arm/math/conv3x3s1p01_depthwise_fp32.cc @@ -91,23 +91,20 @@ void conv_depthwise_3x3s1_fp32(const float *din, bool flag_bias, const operators::ActivationParam act_param, ARMContext *ctx) { + bool has_active = act_param.has_active; + bool flag_relu = false; + bool relu6 = false; + if (has_active) { + if (act_param.active_type == lite_api::ActivationType::kRelu) { + flag_relu = true; + } else { + relu6 = true; + } + } if (pad == 0) { if (w_in > 5) { - conv_depthwise_3x3s1p0_bias(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - act_param, - ctx); - } else { - conv_depthwise_3x3s1p0_bias_s(dout, + if (relu6) { + conv_depthwise_3x3s1p0_bias(dout, din, weights, bias, @@ -120,25 +117,57 @@ void conv_depthwise_3x3s1_fp32(const float *din, w_out, act_param, ctx); + } else { + conv_depthwise_3x3s1p0_bias_relu(dout, + din, + weights, + bias, + flag_bias, + flag_relu, + num, + ch_in, + h_in, + w_in, + h_out, + w_out, + ctx); + } + } else { + if (relu6) { + conv_depthwise_3x3s1p0_bias_s(dout, + din, + weights, + bias, + flag_bias, + num, + ch_in, + h_in, + w_in, + h_out, + w_out, + act_param, + ctx); + } else { + conv_depthwise_3x3s1p0_bias_s_relu(dout, + din, + weights, + bias, + 
flag_bias, + flag_relu, + num, + ch_in, + h_in, + w_in, + h_out, + w_out, + ctx); + } } } if (pad == 1) { if (w_in > 4) { - conv_depthwise_3x3s1p1_bias(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - act_param, - ctx); - } else { - conv_depthwise_3x3s1p1_bias_s(dout, + if (relu6) { + conv_depthwise_3x3s1p1_bias(dout, din, weights, bias, @@ -151,6 +180,51 @@ void conv_depthwise_3x3s1_fp32(const float *din, w_out, act_param, ctx); + } else { + conv_depthwise_3x3s1p1_bias_relu(dout, + din, + weights, + bias, + flag_bias, + flag_relu, + num, + ch_in, + h_in, + w_in, + h_out, + w_out, + ctx); + } + } else { + if (relu6) { + conv_depthwise_3x3s1p1_bias_s(dout, + din, + weights, + bias, + flag_bias, + num, + ch_in, + h_in, + w_in, + h_out, + w_out, + act_param, + ctx); + } else { + conv_depthwise_3x3s1p1_bias_s_relu(dout, + din, + weights, + bias, + flag_bias, + flag_relu, + num, + ch_in, + h_in, + w_in, + h_out, + w_out, + ctx); + } } } } @@ -1924,223 +1998,169 @@ void act_switch_3x3s1p1(const float *din_ptr0, float *vbias, int cnt, const operators::ActivationParam act_param) { - bool has_active = act_param.has_active; - if (has_active) { - float32x4_t vsix = vdupq_n_f32(act_param.Relu_clipped_coef); - float32x4_t vscale = vdupq_n_f32(act_param.Leaky_relu_alpha); - - switch (act_param.active_type) { - case lite_api::ActivationType::kRelu: - asm volatile( - INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_RELU MID_COMPUTE_S1 - MID_RESULT_S1_RELU RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_RELU - : [cnt] "+r"(cnt), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [din_ptr5] "+r"(din_ptr5), - [doutr0] "+r"(doutr0), - [doutr1] "+r"(doutr1), - [doutr2] "+r"(doutr2), - [doutr3] "+r"(doutr3) - : [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [bias_val] "r"(vbias), - [vmask] "r"(vmask), - [rmask] "r"(rmask), - [vzero] "w"(vzero) - : "cc", - "memory", 
- "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25"); - break; - case lite_api::ActivationType::kRelu6: - /* 0 <= din <= 6 */ - asm volatile( - INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_RELU6 MID_COMPUTE_S1 - MID_RESULT_S1_RELU6 RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_RELU6 - : [cnt] "+r"(cnt), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [din_ptr5] "+r"(din_ptr5), - [doutr0] "+r"(doutr0), - [doutr1] "+r"(doutr1), - [doutr2] "+r"(doutr2), - [doutr3] "+r"(doutr3) - : [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [vsix] "w"(vsix), - [bias_val] "r"(vbias), - [vmask] "r"(vmask), - [rmask] "r"(rmask), - [vzero] "w"(vzero) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25"); - break; - case lite_api::ActivationType::kLeakyRelu: - /*din = din >= 0 ? 
din : din * scale*/ - asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_LEAKY_RELU - MID_COMPUTE_S1 MID_RESULT_S1_LEAKY_RELU - RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_LEAKY_RELU - : [cnt] "+r"(cnt), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [din_ptr5] "+r"(din_ptr5), - [doutr0] "+r"(doutr0), - [doutr1] "+r"(doutr1), - [doutr2] "+r"(doutr2), - [doutr3] "+r"(doutr3) - : [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [vscale] "w"(vscale), - [bias_val] "r"(vbias), - [vmask] "r"(vmask), - [rmask] "r"(rmask), - [vzero] "w"(vzero) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25"); - break; - default: - LOG(FATAL) << "this act_type: " - << static_cast(act_param.active_type) - << " fuse not support"; - } - } else { - asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1 MID_COMPUTE_S1 - MID_RESULT_S1 RIGHT_COMPUTE_S1 RIGHT_RESULT_S1 - : [cnt] "+r"(cnt), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [din_ptr5] "+r"(din_ptr5), - [doutr0] "+r"(doutr0), - [doutr1] "+r"(doutr1), - [doutr2] "+r"(doutr2), - [doutr3] "+r"(doutr3) - : [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [bias_val] "r"(vbias), - [vmask] "r"(vmask), - [rmask] "r"(rmask), - [vzero] "w"(vzero) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25"); + float32x4_t vsix = vdupq_n_f32(act_param.Relu_clipped_coef); + float32x4_t vscale = vdupq_n_f32(act_param.Leaky_relu_alpha); + + switch (act_param.active_type) { + case 
lite_api::ActivationType::kRelu: + asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_RELU MID_COMPUTE_S1 + MID_RESULT_S1_RELU RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_RELU + : [cnt] "+r"(cnt), + [din_ptr0] "+r"(din_ptr0), + [din_ptr1] "+r"(din_ptr1), + [din_ptr2] "+r"(din_ptr2), + [din_ptr3] "+r"(din_ptr3), + [din_ptr4] "+r"(din_ptr4), + [din_ptr5] "+r"(din_ptr5), + [doutr0] "+r"(doutr0), + [doutr1] "+r"(doutr1), + [doutr2] "+r"(doutr2), + [doutr3] "+r"(doutr3) + : [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [bias_val] "r"(vbias), + [vmask] "r"(vmask), + [rmask] "r"(rmask), + [vzero] "w"(vzero) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "v23", + "v24", + "v25"); + break; + case lite_api::ActivationType::kRelu6: + /* 0 <= din <= 6 */ + asm volatile( + INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_RELU6 MID_COMPUTE_S1 + MID_RESULT_S1_RELU6 RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_RELU6 + : [cnt] "+r"(cnt), + [din_ptr0] "+r"(din_ptr0), + [din_ptr1] "+r"(din_ptr1), + [din_ptr2] "+r"(din_ptr2), + [din_ptr3] "+r"(din_ptr3), + [din_ptr4] "+r"(din_ptr4), + [din_ptr5] "+r"(din_ptr5), + [doutr0] "+r"(doutr0), + [doutr1] "+r"(doutr1), + [doutr2] "+r"(doutr2), + [doutr3] "+r"(doutr3) + : [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [vsix] "w"(vsix), + [bias_val] "r"(vbias), + [vmask] "r"(vmask), + [rmask] "r"(rmask), + [vzero] "w"(vzero) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "v23", + "v24", + "v25"); + break; + case lite_api::ActivationType::kLeakyRelu: + /*din = din >= 0 ? 
din : din * scale*/ + asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_LEAKY_RELU + MID_COMPUTE_S1 MID_RESULT_S1_LEAKY_RELU RIGHT_COMPUTE_S1 + RIGHT_RESULT_S1_LEAKY_RELU + : [cnt] "+r"(cnt), + [din_ptr0] "+r"(din_ptr0), + [din_ptr1] "+r"(din_ptr1), + [din_ptr2] "+r"(din_ptr2), + [din_ptr3] "+r"(din_ptr3), + [din_ptr4] "+r"(din_ptr4), + [din_ptr5] "+r"(din_ptr5), + [doutr0] "+r"(doutr0), + [doutr1] "+r"(doutr1), + [doutr2] "+r"(doutr2), + [doutr3] "+r"(doutr3) + : [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [vscale] "w"(vscale), + [bias_val] "r"(vbias), + [vmask] "r"(vmask), + [rmask] "r"(rmask), + [vzero] "w"(vzero) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "v23", + "v24", + "v25"); + break; + default: + LOG(FATAL) << "this act_type: " << static_cast(act_param.active_type) + << " fuse not support"; } } #else @@ -2159,153 +2179,117 @@ void act_switch_3x3s1p1(const float *din_ptr0, float bias_val, int cnt, const operators::ActivationParam act_param) { - bool has_active = act_param.has_active; - if (has_active) { - float tmp = act_param.Relu_clipped_coef; - float ss = act_param.Leaky_relu_alpha; - float vsix[4] = {tmp, tmp, tmp, tmp}; - float vscale[4] = {ss, ss, ss, ss}; - - switch (act_param.active_type) { - case lite_api::ActivationType::kRelu: - asm volatile( - INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_RELU MID_COMPUTE_S1 - MID_RESULT_S1_RELU RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_RELU - : [dout_ptr1] "+r"(doutr0), - [dout_ptr2] "+r"(doutr1), - [din0_ptr] "+r"(din_ptr0), - [din1_ptr] "+r"(din_ptr1), - [din2_ptr] "+r"(din_ptr2), - [din3_ptr] "+r"(din_ptr3), - [cnt] "+r"(cnt), - [rmask] "+r"(rmask_ptr), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias_val] "r"(bias_val), - [vzero] "w"(vzero) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - 
"q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - break; - case lite_api::ActivationType::kRelu6: - /* 0 <= din <= 6 */ - asm volatile( - INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_RELU6 MID_COMPUTE_S1 - MID_RESULT_S1_RELU6 RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_RELU6 - : [dout_ptr1] "+r"(doutr0), - [dout_ptr2] "+r"(doutr1), - [din0_ptr] "+r"(din_ptr0), - [din1_ptr] "+r"(din_ptr1), - [din2_ptr] "+r"(din_ptr2), - [din3_ptr] "+r"(din_ptr3), - [cnt] "+r"(cnt), - [rmask] "+r"(rmask_ptr), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias_val] "r"(bias_val), - [six_ptr] "r"(vsix), - [vzero] "w"(vzero) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - break; - case lite_api::ActivationType::kLeakyRelu: - /*din = din >= 0 ? din : din * scale*/ - asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_LEAKY_RELU - MID_COMPUTE_S1 MID_RESULT_S1_LEAKY_RELU - RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_LEAKY_RELU - : [dout_ptr1] "+r"(doutr0), - [dout_ptr2] "+r"(doutr1), - [din0_ptr] "+r"(din_ptr0), - [din1_ptr] "+r"(din_ptr1), - [din2_ptr] "+r"(din_ptr2), - [din3_ptr] "+r"(din_ptr3), - [cnt] "+r"(cnt), - [rmask] "+r"(rmask_ptr), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias_val] "r"(bias_val), - [scale_ptr] "r"(vscale), - [vzero] "w"(vzero) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - break; - default: - LOG(FATAL) << "this act_type: " - << static_cast(act_param.active_type) - << " fuse not support"; - } - } else { - asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1 MID_COMPUTE_S1 - MID_RESULT_S1 RIGHT_COMPUTE_S1 RIGHT_RESULT_S1 - : [dout_ptr1] "+r"(doutr0), - [dout_ptr2] "+r"(doutr1), - [din0_ptr] "+r"(din_ptr0), - [din1_ptr] "+r"(din_ptr1), - [din2_ptr] "+r"(din_ptr2), - [din3_ptr] "+r"(din_ptr3), - [cnt] "+r"(cnt), - [rmask] 
"+r"(rmask_ptr), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias_val] "r"(bias_val), - [vzero] "w"(vzero) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); + float tmp = act_param.Relu_clipped_coef; + float ss = act_param.Leaky_relu_alpha; + float vsix[4] = {tmp, tmp, tmp, tmp}; + float vscale[4] = {ss, ss, ss, ss}; + + switch (act_param.active_type) { + case lite_api::ActivationType::kRelu: + asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_RELU MID_COMPUTE_S1 + MID_RESULT_S1_RELU RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_RELU + : [dout_ptr1] "+r"(doutr0), + [dout_ptr2] "+r"(doutr1), + [din0_ptr] "+r"(din_ptr0), + [din1_ptr] "+r"(din_ptr1), + [din2_ptr] "+r"(din_ptr2), + [din3_ptr] "+r"(din_ptr3), + [cnt] "+r"(cnt), + [rmask] "+r"(rmask_ptr), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias_val] "r"(bias_val), + [vzero] "w"(vzero) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + break; + case lite_api::ActivationType::kRelu6: + /* 0 <= din <= 6 */ + asm volatile( + INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_RELU6 MID_COMPUTE_S1 + MID_RESULT_S1_RELU6 RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_RELU6 + : [dout_ptr1] "+r"(doutr0), + [dout_ptr2] "+r"(doutr1), + [din0_ptr] "+r"(din_ptr0), + [din1_ptr] "+r"(din_ptr1), + [din2_ptr] "+r"(din_ptr2), + [din3_ptr] "+r"(din_ptr3), + [cnt] "+r"(cnt), + [rmask] "+r"(rmask_ptr), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias_val] "r"(bias_val), + [six_ptr] "r"(vsix), + [vzero] "w"(vzero) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + break; + case lite_api::ActivationType::kLeakyRelu: + /*din = din >= 0 ? 
din : din * scale*/ + asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_LEAKY_RELU + MID_COMPUTE_S1 MID_RESULT_S1_LEAKY_RELU RIGHT_COMPUTE_S1 + RIGHT_RESULT_S1_LEAKY_RELU + : [dout_ptr1] "+r"(doutr0), + [dout_ptr2] "+r"(doutr1), + [din0_ptr] "+r"(din_ptr0), + [din1_ptr] "+r"(din_ptr1), + [din2_ptr] "+r"(din_ptr2), + [din3_ptr] "+r"(din_ptr3), + [cnt] "+r"(cnt), + [rmask] "+r"(rmask_ptr), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias_val] "r"(bias_val), + [scale_ptr] "r"(vscale), + [vzero] "w"(vzero) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + break; + default: + LOG(FATAL) << "this act_type: " << static_cast(act_param.active_type) + << " fuse not support"; } } #endif @@ -2575,278 +2559,214 @@ void act_switch_3x3s1p1_s(const float *din_ptr0, float32x4_t vzero, float32x4_t wbias, const operators::ActivationParam act_param) { - bool has_active = act_param.has_active; - if (has_active) { #ifdef __aarch64__ - float32x4_t vsix = vdupq_n_f32(act_param.Relu_clipped_coef); - float32x4_t vscale = vdupq_n_f32(act_param.Leaky_relu_alpha); + float32x4_t vsix = vdupq_n_f32(act_param.Relu_clipped_coef); + float32x4_t vscale = vdupq_n_f32(act_param.Leaky_relu_alpha); #else - float tmp = act_param.Relu_clipped_coef; - float ss = act_param.Leaky_relu_alpha; - float vsix[4] = {tmp, tmp, tmp, tmp}; - float vscale[4] = {ss, ss, ss, ss}; + float tmp = act_param.Relu_clipped_coef; + float ss = act_param.Leaky_relu_alpha; + float vsix[4] = {tmp, tmp, tmp, tmp}; + float vscale[4] = {ss, ss, ss, ss}; #endif - switch (act_param.active_type) { - case lite_api::ActivationType::kRelu: + switch (act_param.active_type) { + case lite_api::ActivationType::kRelu: #ifdef __aarch64__ - asm volatile(COMPUTE_S_S1 RESULT_S_S1_RELU - : [din0] "+r"(din_ptr0), - [din1] "+r"(din_ptr1), - [din2] "+r"(din_ptr2), - [din3] "+r"(din_ptr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] 
"w"(wr2), - [vzero] "w"(vzero), - [mask] "w"(vmask_rp), - [bias] "w"(wbias), - [out1] "r"(doutr0), - [out2] "r"(doutr1) - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17"); - break; + asm volatile(COMPUTE_S_S1 RESULT_S_S1_RELU + : [din0] "+r"(din_ptr0), + [din1] "+r"(din_ptr1), + [din2] "+r"(din_ptr2), + [din3] "+r"(din_ptr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [mask] "w"(vmask_rp), + [bias] "w"(wbias), + [out1] "r"(doutr0), + [out2] "r"(doutr1) + : "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17"); + break; #else - asm volatile(COMPUTE_S_S1 RESULT_S_S1_RELU - : [din0] "+r"(din_ptr0), - [din1] "+r"(din_ptr1), - [din2] "+r"(din_ptr2), - [din3] "+r"(din_ptr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vzero] "w"(vzero), - [mask] "w"(vmask_rp), - [bias] "w"(wbias), - [out1] "r"(doutr0), - [out2] "r"(doutr1) - : "cc", - "memory", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - break; + asm volatile(COMPUTE_S_S1 RESULT_S_S1_RELU + : [din0] "+r"(din_ptr0), + [din1] "+r"(din_ptr1), + [din2] "+r"(din_ptr2), + [din3] "+r"(din_ptr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [mask] "w"(vmask_rp), + [bias] "w"(wbias), + [out1] "r"(doutr0), + [out2] "r"(doutr1) + : "cc", + "memory", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + break; #endif - case lite_api::ActivationType::kRelu6: + case lite_api::ActivationType::kRelu6: /* 0 <= din <= 6 */ #ifdef __aarch64__ - asm volatile(COMPUTE_S_S1 RESULT_S_S1_RELU6 - : [din0] "+r"(din_ptr0), - [din1] "+r"(din_ptr1), - [din2] "+r"(din_ptr2), - [din3] "+r"(din_ptr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vzero] "w"(vzero), - [mask] 
"w"(vmask_rp), - [bias] "w"(wbias), - [vsix] "w"(vsix), - [out1] "r"(doutr0), - [out2] "r"(doutr1) - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17"); - break; + asm volatile(COMPUTE_S_S1 RESULT_S_S1_RELU6 + : [din0] "+r"(din_ptr0), + [din1] "+r"(din_ptr1), + [din2] "+r"(din_ptr2), + [din3] "+r"(din_ptr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [mask] "w"(vmask_rp), + [bias] "w"(wbias), + [vsix] "w"(vsix), + [out1] "r"(doutr0), + [out2] "r"(doutr1) + : "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17"); + break; #else - asm volatile(COMPUTE_S_S1 RESULT_S_S1_RELU6 - : [din0] "+r"(din_ptr0), - [din1] "+r"(din_ptr1), - [din2] "+r"(din_ptr2), - [din3] "+r"(din_ptr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vzero] "w"(vzero), - [mask] "w"(vmask_rp), - [bias] "w"(wbias), - [six_ptr] "r"(vsix), - [out1] "r"(doutr0), - [out2] "r"(doutr1) - : "cc", - "memory", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - break; + asm volatile(COMPUTE_S_S1 RESULT_S_S1_RELU6 + : [din0] "+r"(din_ptr0), + [din1] "+r"(din_ptr1), + [din2] "+r"(din_ptr2), + [din3] "+r"(din_ptr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [mask] "w"(vmask_rp), + [bias] "w"(wbias), + [six_ptr] "r"(vsix), + [out1] "r"(doutr0), + [out2] "r"(doutr1) + : "cc", + "memory", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + break; #endif - case lite_api::ActivationType::kLeakyRelu: + case lite_api::ActivationType::kLeakyRelu: /*din = din >= 0 ? 
din : din * scale*/ #ifdef __aarch64__ - asm volatile(COMPUTE_S_S1 RESULT_S_S1_LEAKY_RELU - : [din0] "+r"(din_ptr0), - [din1] "+r"(din_ptr1), - [din2] "+r"(din_ptr2), - [din3] "+r"(din_ptr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vzero] "w"(vzero), - [mask] "w"(vmask_rp), - [bias] "w"(wbias), - [vscale] "w"(vscale), - [out1] "r"(doutr0), - [out2] "r"(doutr1) - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20"); - break; -#else - asm volatile(COMPUTE_S_S1 RESULT_S_S1_LEAKY_RELU - : [din0] "+r"(din_ptr0), - [din1] "+r"(din_ptr1), - [din2] "+r"(din_ptr2), - [din3] "+r"(din_ptr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vzero] "w"(vzero), - [mask] "w"(vmask_rp), - [bias] "w"(wbias), - [scale_ptr] "r"(vscale), - [out1] "r"(doutr0), - [out2] "r"(doutr1) - : "cc", - "memory", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - break; -#endif - default: - LOG(FATAL) << "this act_type: " - << static_cast(act_param.active_type) - << " fuse not support"; - } - } else { -#ifdef __aarch64__ - asm volatile(COMPUTE_S_S1 RESULT_S_S1 - : [din0] "+r"(din_ptr0), - [din1] "+r"(din_ptr1), - [din2] "+r"(din_ptr2), - [din3] "+r"(din_ptr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vzero] "w"(vzero), - [mask] "w"(vmask_rp), - [bias] "w"(wbias), - [out1] "r"(doutr0), - [out2] "r"(doutr1) - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17"); + asm volatile(COMPUTE_S_S1 RESULT_S_S1_LEAKY_RELU + : [din0] "+r"(din_ptr0), + [din1] "+r"(din_ptr1), + [din2] "+r"(din_ptr2), + [din3] "+r"(din_ptr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [mask] "w"(vmask_rp), + [bias] "w"(wbias), + [vscale] "w"(vscale), + [out1] "r"(doutr0), + [out2] "r"(doutr1) 
+ : "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20"); + break; #else - asm volatile(COMPUTE_S_S1 RESULT_S_S1 - : [din0] "+r"(din_ptr0), - [din1] "+r"(din_ptr1), - [din2] "+r"(din_ptr2), - [din3] "+r"(din_ptr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vzero] "w"(vzero), - [mask] "w"(vmask_rp), - [bias] "w"(wbias), - [out1] "r"(doutr0), - [out2] "r"(doutr1) - : "cc", - "memory", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); + asm volatile(COMPUTE_S_S1 RESULT_S_S1_LEAKY_RELU + : [din0] "+r"(din_ptr0), + [din1] "+r"(din_ptr1), + [din2] "+r"(din_ptr2), + [din3] "+r"(din_ptr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [mask] "w"(vmask_rp), + [bias] "w"(wbias), + [scale_ptr] "r"(vscale), + [out1] "r"(doutr0), + [out2] "r"(doutr1) + : "cc", + "memory", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + break; #endif + default: + LOG(FATAL) << "this act_type: " << static_cast(act_param.active_type) + << " fuse not support"; } } /** @@ -2987,262 +2907,198 @@ void act_switch_3x3s1p0(const float *din_ptr0, int cnt, int remain, const operators::ActivationParam act_param) { - bool has_active = act_param.has_active; - if (has_active) { - float32x4_t vsix = vdupq_n_f32(act_param.Relu_clipped_coef); - float32x4_t vscale = vdupq_n_f32(act_param.Leaky_relu_alpha); - - switch (act_param.active_type) { - case lite_api::ActivationType::kRelu: - asm volatile( - INIT_S1 - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v17 = 2345 */ - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - 
MID_COMPUTE_S1 MID_RESULT_S1_RELU - "cmp %w[remain], #1 \n" - "blt 0f \n" RIGHT_COMPUTE_S1 - RIGHT_RESULT_S1_RELU "0: \n" - : [cnt] "+r"(cnt), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [din_ptr5] "+r"(din_ptr5), - [doutr0] "+r"(doutr0), - [doutr1] "+r"(doutr1), - [doutr2] "+r"(doutr2), - [doutr3] "+r"(doutr3) - : [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [bias_val] "r"(vbias), - [vmask] "r"(vmask), - [rmask] "r"(rmask), - [vzero] "w"(vzero), - [remain] "r"(remain) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25"); - break; - case lite_api::ActivationType::kRelu6: - /* 0 <= din <= 6 */ - asm volatile( - INIT_S1 - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v17 = 2345 */ - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - MID_COMPUTE_S1 MID_RESULT_S1_RELU6 - "cmp %w[remain], #1 \n" - "blt 0f \n" RIGHT_COMPUTE_S1 - RIGHT_RESULT_S1_RELU6 "0: \n" - : [cnt] "+r"(cnt), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [din_ptr5] "+r"(din_ptr5), - [doutr0] "+r"(doutr0), - [doutr1] "+r"(doutr1), - [doutr2] "+r"(doutr2), - [doutr3] "+r"(doutr3) - : [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [vsix] "w"(vsix), - [bias_val] "r"(vbias), - [vmask] "r"(vmask), - [rmask] "r"(rmask), - [remain] "r"(remain) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - 
"v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25"); - break; - case lite_api::ActivationType::kLeakyRelu: - /*din = din >= 0 ? din : din * scale*/ - asm volatile( - INIT_S1 - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v17 = 2345 */ - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - MID_COMPUTE_S1 MID_RESULT_S1_LEAKY_RELU - "cmp %w[remain], #1 \n" - "blt 0f \n" RIGHT_COMPUTE_S1 - RIGHT_RESULT_S1_LEAKY_RELU "0: \n" - : [cnt] "+r"(cnt), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [din_ptr5] "+r"(din_ptr5), - [doutr0] "+r"(doutr0), - [doutr1] "+r"(doutr1), - [doutr2] "+r"(doutr2), - [doutr3] "+r"(doutr3) - : [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [vscale] "w"(vscale), - [bias_val] "r"(vbias), - [vmask] "r"(vmask), - [rmask] "r"(rmask), - [remain] "r"(remain) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25"); - break; - default: - LOG(FATAL) << "this act_type: " - << static_cast(act_param.active_type) - << " fuse not support"; - } - } else { - asm volatile( - INIT_S1 - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v17 = 2345 */ - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - MID_COMPUTE_S1 MID_RESULT_S1 - "cmp 
%w[remain], #1 \n" - "blt 0f \n" RIGHT_COMPUTE_S1 RIGHT_RESULT_S1 - "0: \n" - : [cnt] "+r"(cnt), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [din_ptr5] "+r"(din_ptr5), - [doutr0] "+r"(doutr0), - [doutr1] "+r"(doutr1), - [doutr2] "+r"(doutr2), - [doutr3] "+r"(doutr3) - : [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [bias_val] "r"(vbias), - [vmask] "r"(vmask), - [rmask] "r"(rmask), - [vzero] "w"(vzero), - [remain] "r"(remain) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25"); + float32x4_t vsix = vdupq_n_f32(act_param.Relu_clipped_coef); + float32x4_t vscale = vdupq_n_f32(act_param.Leaky_relu_alpha); + + switch (act_param.active_type) { + case lite_api::ActivationType::kRelu: + asm volatile( + INIT_S1 + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ + "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v17 = 2345 */ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ + MID_COMPUTE_S1 MID_RESULT_S1_RELU + "cmp %w[remain], #1 \n" + "blt 0f \n" RIGHT_COMPUTE_S1 + RIGHT_RESULT_S1_RELU "0: \n" + : [cnt] "+r"(cnt), + [din_ptr0] "+r"(din_ptr0), + [din_ptr1] "+r"(din_ptr1), + [din_ptr2] "+r"(din_ptr2), + [din_ptr3] "+r"(din_ptr3), + [din_ptr4] "+r"(din_ptr4), + [din_ptr5] "+r"(din_ptr5), + [doutr0] "+r"(doutr0), + [doutr1] "+r"(doutr1), + [doutr2] "+r"(doutr2), + [doutr3] "+r"(doutr3) + : [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [bias_val] "r"(vbias), + [vmask] "r"(vmask), + [rmask] "r"(rmask), + [vzero] "w"(vzero), + [remain] "r"(remain) + : "cc", + "memory", + "v0", + "v1", + 
"v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "v23", + "v24", + "v25"); + break; + case lite_api::ActivationType::kRelu6: + /* 0 <= din <= 6 */ + asm volatile( + INIT_S1 + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ + "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v17 = 2345 */ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ + MID_COMPUTE_S1 MID_RESULT_S1_RELU6 + "cmp %w[remain], #1 \n" + "blt 0f \n" RIGHT_COMPUTE_S1 + RIGHT_RESULT_S1_RELU6 "0: \n" + : [cnt] "+r"(cnt), + [din_ptr0] "+r"(din_ptr0), + [din_ptr1] "+r"(din_ptr1), + [din_ptr2] "+r"(din_ptr2), + [din_ptr3] "+r"(din_ptr3), + [din_ptr4] "+r"(din_ptr4), + [din_ptr5] "+r"(din_ptr5), + [doutr0] "+r"(doutr0), + [doutr1] "+r"(doutr1), + [doutr2] "+r"(doutr2), + [doutr3] "+r"(doutr3) + : [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [vsix] "w"(vsix), + [bias_val] "r"(vbias), + [vmask] "r"(vmask), + [rmask] "r"(rmask), + [remain] "r"(remain) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "v23", + "v24", + "v25"); + break; + case lite_api::ActivationType::kLeakyRelu: + /*din = din >= 0 ? 
din : din * scale*/ + asm volatile( + INIT_S1 + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ + "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v17 = 2345 */ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ + MID_COMPUTE_S1 MID_RESULT_S1_LEAKY_RELU + "cmp %w[remain], #1 \n" + "blt 0f \n" RIGHT_COMPUTE_S1 + RIGHT_RESULT_S1_LEAKY_RELU "0: \n" + : [cnt] "+r"(cnt), + [din_ptr0] "+r"(din_ptr0), + [din_ptr1] "+r"(din_ptr1), + [din_ptr2] "+r"(din_ptr2), + [din_ptr3] "+r"(din_ptr3), + [din_ptr4] "+r"(din_ptr4), + [din_ptr5] "+r"(din_ptr5), + [doutr0] "+r"(doutr0), + [doutr1] "+r"(doutr1), + [doutr2] "+r"(doutr2), + [doutr3] "+r"(doutr3) + : [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [vscale] "w"(vscale), + [bias_val] "r"(vbias), + [vmask] "r"(vmask), + [rmask] "r"(rmask), + [remain] "r"(remain) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "v23", + "v24", + "v25"); + break; + default: + LOG(FATAL) << "this act_type: " << static_cast(act_param.active_type) + << " fuse not support"; } } #else @@ -3262,191 +3118,146 @@ void act_switch_3x3s1p0(const float *din_ptr0, int cnt, int remain, const operators::ActivationParam act_param) { - bool has_active = act_param.has_active; - if (has_active) { - float tmp = act_param.Relu_clipped_coef; - float ss = act_param.Leaky_relu_alpha; - float vsix[4] = {tmp, tmp, tmp, tmp}; - float vscale[4] = {ss, ss, ss, ss}; - - switch (act_param.active_type) { - case lite_api::ActivationType::kRelu: - asm volatile(INIT_S1 - "sub %[din0_ptr], #8 @ 0pad + 2 float data overlap\n" - "sub %[din1_ptr], #8 @ 0pad + 2 float data overlap\n" - "sub %[din2_ptr], #8 @ 0pad + 2 float data 
overlap\n" - "sub %[din3_ptr], #8 @ 0pad + 2 float data overlap\n" - "vext.32 q6, q8, q9, #1 @ 0012\n" - "vext.32 q7, q8, q9, #2 @ 1234\n" MID_COMPUTE_S1 - MID_RESULT_S1_RELU - "cmp %[remain], #1 \n" - "blt 0f \n" RIGHT_COMPUTE_S1 - RIGHT_RESULT_S1_RELU "0: \n" - : [dout_ptr1] "+r"(doutr0), - [dout_ptr2] "+r"(doutr1), - [din0_ptr] "+r"(din_ptr0), - [din1_ptr] "+r"(din_ptr1), - [din2_ptr] "+r"(din_ptr2), - [din3_ptr] "+r"(din_ptr3), - [cnt] "+r"(cnt), - [rmask] "+r"(rmask_ptr), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias_val] "r"(bias_val), - [vzero] "w"(vzero), - [remain] "r"(remain) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - break; - case lite_api::ActivationType::kRelu6: - /* 0 <= din <= 6 */ - asm volatile(INIT_S1 - "sub %[din0_ptr], #8 @ 0pad + 2 float data overlap\n" - "sub %[din1_ptr], #8 @ 0pad + 2 float data overlap\n" - "sub %[din2_ptr], #8 @ 0pad + 2 float data overlap\n" - "sub %[din3_ptr], #8 @ 0pad + 2 float data overlap\n" - "vext.32 q6, q8, q9, #1 @ 0012\n" - "vext.32 q7, q8, q9, #2 @ 1234\n" MID_COMPUTE_S1 - MID_RESULT_S1_RELU6 - "cmp %[remain], #1 \n" - "blt 0f \n" RIGHT_COMPUTE_S1 - RIGHT_RESULT_S1_RELU6 "0: \n" - : [dout_ptr1] "+r"(doutr0), - [dout_ptr2] "+r"(doutr1), - [din0_ptr] "+r"(din_ptr0), - [din1_ptr] "+r"(din_ptr1), - [din2_ptr] "+r"(din_ptr2), - [din3_ptr] "+r"(din_ptr3), - [cnt] "+r"(cnt), - [rmask] "+r"(rmask_ptr), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [six_ptr] "r"(vsix), - [bias_val] "r"(bias_val), - [vzero] "w"(vzero), - [remain] "r"(remain) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - break; - case lite_api::ActivationType::kLeakyRelu: - /*din = din >= 0 ? 
din : din * scale*/ - asm volatile(INIT_S1 - "sub %[din0_ptr], #8 @ 0pad + 2 float data overlap\n" - "sub %[din1_ptr], #8 @ 0pad + 2 float data overlap\n" - "sub %[din2_ptr], #8 @ 0pad + 2 float data overlap\n" - "sub %[din3_ptr], #8 @ 0pad + 2 float data overlap\n" - "vext.32 q6, q8, q9, #1 @ 0012\n" - "vext.32 q7, q8, q9, #2 @ 1234\n" MID_COMPUTE_S1 - MID_RESULT_S1_LEAKY_RELU - "cmp %[remain], #1 \n" - "blt 0f \n" RIGHT_COMPUTE_S1 - RIGHT_RESULT_S1_LEAKY_RELU - "0: \n" - : [dout_ptr1] "+r"(doutr0), - [dout_ptr2] "+r"(doutr1), - [din0_ptr] "+r"(din_ptr0), - [din1_ptr] "+r"(din_ptr1), - [din2_ptr] "+r"(din_ptr2), - [din3_ptr] "+r"(din_ptr3), - [cnt] "+r"(cnt), - [rmask] "+r"(rmask_ptr), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [scale_ptr] "r"(vscale), - [bias_val] "r"(bias_val), - [vzero] "w"(vzero), - [remain] "r"(remain) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - break; - default: - LOG(FATAL) << "this act_type: " - << static_cast(act_param.active_type) - << " fuse not support"; - } - } else { - asm volatile( - INIT_S1 - "sub %[din0_ptr], #8 @ 0pad + 2 float data overlap\n" - "sub %[din1_ptr], #8 @ 0pad + 2 float data overlap\n" - "sub %[din2_ptr], #8 @ 0pad + 2 float data overlap\n" - "sub %[din3_ptr], #8 @ 0pad + 2 float data overlap\n" - "vext.32 q6, q8, q9, #1 @ 0012\n" - "vext.32 q7, q8, q9, #2 @ 1234\n" MID_COMPUTE_S1 MID_RESULT_S1 - "cmp %[remain], #1 \n" - "blt 0f \n" RIGHT_COMPUTE_S1 RIGHT_RESULT_S1 - "0: \n" - : [dout_ptr1] "+r"(doutr0), - [dout_ptr2] "+r"(doutr1), - [din0_ptr] "+r"(din_ptr0), - [din1_ptr] "+r"(din_ptr1), - [din2_ptr] "+r"(din_ptr2), - [din3_ptr] "+r"(din_ptr3), - [cnt] "+r"(cnt), - [rmask] "+r"(rmask_ptr), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias_val] "r"(bias_val), - [vzero] "w"(vzero), - [remain] "r"(remain) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", 
- "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); + float tmp = act_param.Relu_clipped_coef; + float ss = act_param.Leaky_relu_alpha; + float vsix[4] = {tmp, tmp, tmp, tmp}; + float vscale[4] = {ss, ss, ss, ss}; + + switch (act_param.active_type) { + case lite_api::ActivationType::kRelu: + asm volatile(INIT_S1 + "sub %[din0_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din1_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din2_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din3_ptr], #8 @ 0pad + 2 float data overlap\n" + "vext.32 q6, q8, q9, #1 @ 0012\n" + "vext.32 q7, q8, q9, #2 @ 1234\n" MID_COMPUTE_S1 + MID_RESULT_S1_RELU + "cmp %[remain], #1 \n" + "blt 0f \n" RIGHT_COMPUTE_S1 + RIGHT_RESULT_S1_RELU "0: \n" + : [dout_ptr1] "+r"(doutr0), + [dout_ptr2] "+r"(doutr1), + [din0_ptr] "+r"(din_ptr0), + [din1_ptr] "+r"(din_ptr1), + [din2_ptr] "+r"(din_ptr2), + [din3_ptr] "+r"(din_ptr3), + [cnt] "+r"(cnt), + [rmask] "+r"(rmask_ptr), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias_val] "r"(bias_val), + [vzero] "w"(vzero), + [remain] "r"(remain) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + break; + case lite_api::ActivationType::kRelu6: + /* 0 <= din <= 6 */ + asm volatile(INIT_S1 + "sub %[din0_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din1_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din2_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din3_ptr], #8 @ 0pad + 2 float data overlap\n" + "vext.32 q6, q8, q9, #1 @ 0012\n" + "vext.32 q7, q8, q9, #2 @ 1234\n" MID_COMPUTE_S1 + MID_RESULT_S1_RELU6 + "cmp %[remain], #1 \n" + "blt 0f \n" RIGHT_COMPUTE_S1 + RIGHT_RESULT_S1_RELU6 "0: \n" + : [dout_ptr1] "+r"(doutr0), + [dout_ptr2] "+r"(doutr1), + [din0_ptr] "+r"(din_ptr0), + [din1_ptr] "+r"(din_ptr1), + [din2_ptr] "+r"(din_ptr2), + [din3_ptr] "+r"(din_ptr3), + [cnt] "+r"(cnt), + [rmask] "+r"(rmask_ptr), + [vmask] 
"+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [six_ptr] "r"(vsix), + [bias_val] "r"(bias_val), + [vzero] "w"(vzero), + [remain] "r"(remain) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + break; + case lite_api::ActivationType::kLeakyRelu: + /*din = din >= 0 ? din : din * scale*/ + asm volatile(INIT_S1 + "sub %[din0_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din1_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din2_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din3_ptr], #8 @ 0pad + 2 float data overlap\n" + "vext.32 q6, q8, q9, #1 @ 0012\n" + "vext.32 q7, q8, q9, #2 @ 1234\n" MID_COMPUTE_S1 + MID_RESULT_S1_LEAKY_RELU + "cmp %[remain], #1 \n" + "blt 0f \n" RIGHT_COMPUTE_S1 + RIGHT_RESULT_S1_LEAKY_RELU + "0: \n" + : [dout_ptr1] "+r"(doutr0), + [dout_ptr2] "+r"(doutr1), + [din0_ptr] "+r"(din_ptr0), + [din1_ptr] "+r"(din_ptr1), + [din2_ptr] "+r"(din_ptr2), + [din3_ptr] "+r"(din_ptr3), + [cnt] "+r"(cnt), + [rmask] "+r"(rmask_ptr), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [scale_ptr] "r"(vscale), + [bias_val] "r"(bias_val), + [vzero] "w"(vzero), + [remain] "r"(remain) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + break; + default: + LOG(FATAL) << "this act_type: " << static_cast(act_param.active_type) + << " fuse not support"; } } #endif @@ -3694,287 +3505,220 @@ void act_switch_3x3s1p0_s(const float *din_ptr0, unsigned int *vmask_ptr, float bias_val, const operators::ActivationParam act_param) { - bool has_active = act_param.has_active; - if (has_active) { #ifdef __aarch64__ - float32x4_t vsix = vdupq_n_f32(act_param.Relu_clipped_coef); - float32x4_t vscale = vdupq_n_f32(act_param.Leaky_relu_alpha); + float32x4_t vsix = vdupq_n_f32(act_param.Relu_clipped_coef); + float32x4_t vscale = vdupq_n_f32(act_param.Leaky_relu_alpha); 
#else - float tmp = act_param.Relu_clipped_coef; - float ss = act_param.Leaky_relu_alpha; - float vsix[4] = {tmp, tmp, tmp, tmp}; - float vscale[4] = {ss, ss, ss, ss}; + float tmp = act_param.Relu_clipped_coef; + float ss = act_param.Leaky_relu_alpha; + float vsix[4] = {tmp, tmp, tmp, tmp}; + float vscale[4] = {ss, ss, ss, ss}; #endif - switch (act_param.active_type) { - case lite_api::ActivationType::kRelu: + switch (act_param.active_type) { + case lite_api::ActivationType::kRelu: #ifdef __aarch64__ - asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_RELU - : [din0] "+r"(din_ptr0), - [din1] "+r"(din_ptr1), - [din2] "+r"(din_ptr2), - [din3] "+r"(din_ptr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vbias] "w"(wbias), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [vzero] "w"(vzero), - [out1] "r"(doutr0), - [out2] "r"(doutr1) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15"); - break; + asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_RELU + : [din0] "+r"(din_ptr0), + [din1] "+r"(din_ptr1), + [din2] "+r"(din_ptr2), + [din3] "+r"(din_ptr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vbias] "w"(wbias), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [vzero] "w"(vzero), + [out1] "r"(doutr0), + [out2] "r"(doutr1) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15"); + break; #else - asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_RELU - : [din0] "+r"(din_ptr0), - [din1] "+r"(din_ptr1), - [din2] "+r"(din_ptr2), - [din3] "+r"(din_ptr3), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vzero] "w"(vzero), - [bias_val] "r"(bias_val), - [out1] "r"(doutr0), - [out2] "r"(doutr1) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - break; + asm 
volatile(COMPUTE_S_S1_P0 RESULT_S_S1_RELU + : [din0] "+r"(din_ptr0), + [din1] "+r"(din_ptr1), + [din2] "+r"(din_ptr2), + [din3] "+r"(din_ptr3), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [bias_val] "r"(bias_val), + [out1] "r"(doutr0), + [out2] "r"(doutr1) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + break; #endif - case lite_api::ActivationType::kRelu6: + case lite_api::ActivationType::kRelu6: /* 0 <= din <= 6 */ #ifdef __aarch64__ - asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_RELU6 - : [din0] "+r"(din_ptr0), - [din1] "+r"(din_ptr1), - [din2] "+r"(din_ptr2), - [din3] "+r"(din_ptr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vbias] "w"(wbias), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [vzero] "w"(vzero), - [vsix] "w"(vsix), - [out1] "r"(doutr0), - [out2] "r"(doutr1) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15"); - break; + asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_RELU6 + : [din0] "+r"(din_ptr0), + [din1] "+r"(din_ptr1), + [din2] "+r"(din_ptr2), + [din3] "+r"(din_ptr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vbias] "w"(wbias), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [vzero] "w"(vzero), + [vsix] "w"(vsix), + [out1] "r"(doutr0), + [out2] "r"(doutr1) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15"); + break; #else - asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_RELU6 - : [din0] "+r"(din_ptr0), - [din1] "+r"(din_ptr1), - [din2] "+r"(din_ptr2), - [din3] "+r"(din_ptr3), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vzero] "w"(vzero), - [six_ptr] "r"(vsix), - [bias_val] "r"(bias_val), - [out1] "r"(doutr0), - [out2] "r"(doutr1) - 
: "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - break; + asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_RELU6 + : [din0] "+r"(din_ptr0), + [din1] "+r"(din_ptr1), + [din2] "+r"(din_ptr2), + [din3] "+r"(din_ptr3), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [six_ptr] "r"(vsix), + [bias_val] "r"(bias_val), + [out1] "r"(doutr0), + [out2] "r"(doutr1) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + break; #endif - case lite_api::ActivationType::kLeakyRelu: + case lite_api::ActivationType::kLeakyRelu: /*din = din >= 0 ? din : din * scale*/ #ifdef __aarch64__ - asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_LEAKY_RELU - : [din0] "+r"(din_ptr0), - [din1] "+r"(din_ptr1), - [din2] "+r"(din_ptr2), - [din3] "+r"(din_ptr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vbias] "w"(wbias), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [vzero] "w"(vzero), - [vscale] "w"(vscale), - [out1] "r"(doutr0), - [out2] "r"(doutr1) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15"); - break; -#else - asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_LEAKY_RELU - : [din0] "+r"(din_ptr0), - [din1] "+r"(din_ptr1), - [din2] "+r"(din_ptr2), - [din3] "+r"(din_ptr3), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vzero] "w"(vzero), - [scale_ptr] "r"(vscale), - [bias_val] "r"(bias_val), - [out1] "r"(doutr0), - [out2] "r"(doutr1) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - break; -#endif - default: - LOG(FATAL) << "this act_type: " - << static_cast(act_param.active_type) - << " fuse not support"; - } - } else { -#ifdef __aarch64__ - asm volatile(COMPUTE_S_S1_P0 
RESULT_S_S1 - : [din0] "+r"(din_ptr0), - [din1] "+r"(din_ptr1), - [din2] "+r"(din_ptr2), - [din3] "+r"(din_ptr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vbias] "w"(wbias), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [vzero] "w"(vzero), - [out1] "r"(doutr0), - [out2] "r"(doutr1) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15"); + asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_LEAKY_RELU + : [din0] "+r"(din_ptr0), + [din1] "+r"(din_ptr1), + [din2] "+r"(din_ptr2), + [din3] "+r"(din_ptr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vbias] "w"(wbias), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [vzero] "w"(vzero), + [vscale] "w"(vscale), + [out1] "r"(doutr0), + [out2] "r"(doutr1) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15"); + break; #else - asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1 - : [din0] "+r"(din_ptr0), - [din1] "+r"(din_ptr1), - [din2] "+r"(din_ptr2), - [din3] "+r"(din_ptr3), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vzero] "w"(vzero), - [bias_val] "r"(bias_val), - [out1] "r"(doutr0), - [out2] "r"(doutr1) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); + asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_LEAKY_RELU + : [din0] "+r"(din_ptr0), + [din1] "+r"(din_ptr1), + [din2] "+r"(din_ptr2), + [din3] "+r"(din_ptr3), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [scale_ptr] "r"(vscale), + [bias_val] "r"(bias_val), + [out1] "r"(doutr0), + [out2] "r"(doutr1) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + break; #endif + default: + LOG(FATAL) << "this act_type: " 
<< static_cast(act_param.active_type) + << " fuse not support"; } } /** diff --git a/lite/backends/arm/math/conv3x3s1p01_depthwise_fp32_relu.cc b/lite/backends/arm/math/conv3x3s1p01_depthwise_fp32_relu.cc new file mode 100644 index 0000000000000000000000000000000000000000..c9dd4d2fd1e30d9b82a8db64a4872095af3f9768 --- /dev/null +++ b/lite/backends/arm/math/conv3x3s1p01_depthwise_fp32_relu.cc @@ -0,0 +1,2418 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include "lite/backends/arm/math/conv_depthwise.h" + +namespace paddle { +namespace lite { +namespace arm { +namespace math { + +#ifdef __aarch64__ +#define INIT_S1 \ + "PRFM PLDL1KEEP, [%[din_ptr0]] \n" \ + "PRFM PLDL1KEEP, [%[din_ptr1]] \n" \ + "PRFM PLDL1KEEP, [%[din_ptr2]] \n" \ + "PRFM PLDL1KEEP, [%[din_ptr3]] \n" \ + "PRFM PLDL1KEEP, [%[din_ptr4]] \n" \ + "PRFM PLDL1KEEP, [%[din_ptr5]] \n" \ + "movi v21.4s, #0x0\n" /* out0 = 0 */ \ + \ + "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ + +#define LEFT_COMPUTE_S1 \ + "ext v16.16b, %[vzero].16b, v0.16b, #12 \n" /* v16 = 00123*/ \ + "ext v17.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ /* r0 */ \ + "fmla v12.4s, v0.4s, %[w0].s[1]\n" /* outr00 += din0_0123 * w0[1]*/ \ + \ + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "sub %[din_ptr0], %[din_ptr0], #4 \n" /* din_ptr0-- */ \ + "sub %[din_ptr1], %[din_ptr1], #4 \n" /* din_ptr0-- */ \ + \ + "fmla v12.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din0_0012 * w0[0]*/ \ + \ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ \ + "sub %[din_ptr2], %[din_ptr2], #4 \n" /* din_ptr0-- */ \ + "sub %[din_ptr3], %[din_ptr3], 
#4 \n" /* din_ptr0-- */ \ + \ + "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_1234 * w0[2]*/ \ + \ + "ext v16.16b, %[vzero].16b, v2.16b, #12 \n" /* v16 = 00123*/ \ + "ext v17.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234 */ /* r1 */ \ + "fmla v13.4s , v2.4s, %[w0].s[1]\n" /* outr00 += din1_0123 * w0[1]*/ \ + "fmla v12.4s , v2.4s, %[w1].s[1]\n" /* outr00 += din1_0123 * w1[1]*/ \ + "sub %[din_ptr4], %[din_ptr4], #4 \n" /* din_ptr0-- */ \ + "sub %[din_ptr5], %[din_ptr5], #4 \n" /* din_ptr0-- */ \ + \ + "fmla v13.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din1_0123 * w0[1]*/ \ + "fmla v12.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din1_0123 * w1[1]*/ \ + \ + "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * w1[1]*/ \ + \ + "ext v17.16b, v4.16b, v5.16b, #4 \n" /* v16=1234 */ \ + "ext v16.16b, %[vzero].16b, v4.16b, #12 \n" /* v16 = 00123*/ \ + \ + /* r2 */ \ + "fmla v14.4s , v4.4s, %[w0].s[1]\n" /* outr00 += din2_0123 * w0[1]*/ \ + "fmla v13.4s , v4.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ + "fmla v12.4s , v4.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w2[1]*/ \ + \ + "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v14.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ + "fmla v13.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ + "fmla v12.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w1[1]*/ \ + \ + "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w1[1]*/ \ + \ + "ext v16.16b, %[vzero].16b, v6.16b, #12 \n" 
/* v16 = 00123*/ \ + "ext v17.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234 */ /* r3 */ \ + "fmla v15.4s , v6.4s, %[w0].s[1]\n" /*outr00 += din2_0123 * w0[1]*/ \ + "fmla v14.4s , v6.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ + "fmla v13.4s , v6.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w2[1]*/ \ + \ + "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ + "fmla v13.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w1[1]*/ \ + \ + "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w1[1]*/ \ + \ + "ext v16.16b, %[vzero].16b, v8.16b, #12 \n" /* v16 = 00123*/ \ + "ext v17.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234 */ + +#define LEFT_RESULT_S1 \ + /* r4 */ \ + "fmla v15.4s , v8.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ + "fmla v14.4s , v8.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w2[1]*/ \ + \ + "st1 {v12.4s}, [%[doutr0]], #16 \n" /* vst1q_f32() */ \ + "st1 {v13.4s}, [%[doutr1]], #16 \n" /* vst1q_f32() */ \ + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w1[1]*/ \ + \ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w1[1]*/ \ + \ + "ext v16.16b, %[vzero].16b, v10.16b, #12 \n" /* v16 = 
00123*/ \ + "ext v17.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234 */ /* r5 */ \ + "fmla v15.4s , v10.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ + \ + "st1 {v14.4s}, [%[doutr2]], #16 \n" /* vst1q_f32() */ \ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ + \ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + \ + "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ \ + \ + "st1 {v15.4s}, [%[doutr3]], #16 \n" /* vst1q_f32() */ \ + "cmp %w[cnt], #1 \n" \ + "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "blt 3f \n" + +#define MID_COMPUTE_S1 \ + "1: \n" /* r0 */ \ + "fmla v12.4s , v0.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v12.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v2.16b, v3.16b, #8 \n" /* v16 = 2345 */ /* r1 */ \ + "fmla v13.4s , v2.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v12.4s , v2.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v13.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v12.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, 
v4.16b, v5.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v4.16b, v5.16b, #8 \n" /* v16 = 2345 */ /* r2 */ \ + "fmla v14.4s , v4.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v13.4s , v4.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v12.4s , v4.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v14.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v13.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v12.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ + +#define MID_RESULT_S1 \ + /* r3 */ \ + "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "st1 {v12.4s}, [%[doutr0]], #16 \n" \ + \ + "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, 
v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ + "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "st1 {v13.4s}, [%[doutr1]], #16 \n" \ + \ + "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ + "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "st1 {v14.4s}, [%[doutr2]], #16 \n" \ + \ + "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ \ + \ + "subs %w[cnt], %w[cnt], #1 \n" \ + \ + "st1 {v15.4s}, [%[doutr3]], #16 \n" \ + "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "bne 1b \n" + +#define RIGHT_COMPUTE_S1 \ + "3: \n" \ + "ld1 {v18.4s, v19.4s}, [%[vmask]] \n" \ + "ld1 {v22.4s}, [%[doutr0]] \n" \ + "ld1 {v23.4s}, [%[doutr1]] \n" \ + "ld1 {v24.4s}, [%[doutr2]] \n" \ + "ld1 {v25.4s}, [%[doutr3]] \n" \ + \ + "bif v0.16b, %[vzero].16b, v18.16b \n" \ + "bif v1.16b, %[vzero].16b, v19.16b \n" \ + "bif 
v2.16b, %[vzero].16b, v18.16b \n" \ + "bif v3.16b, %[vzero].16b, v19.16b \n" \ + \ + "bif v4.16b, %[vzero].16b, v18.16b \n" \ + "bif v5.16b, %[vzero].16b, v19.16b \n" \ + "bif v6.16b, %[vzero].16b, v18.16b \n" \ + "bif v7.16b, %[vzero].16b, v19.16b \n" \ + \ + "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ /* r0 */ \ + "fmla v12.4s, v0.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "bif v8.16b, %[vzero].16b, v18.16b \n" \ + "bif v9.16b, %[vzero].16b, v19.16b \n" \ + "bif v10.16b, %[vzero].16b, v18.16b \n" \ + "bif v11.16b, %[vzero].16b, v19.16b \n" \ + \ + "fmla v12.4s, v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "ld1 {v18.4s}, [%[rmask]] \n" \ + \ + "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v2.16b, v3.16b, #8 \n" /* v16 = 2345 */ /* r1 */ \ + "fmla v13.4s , v2.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v12.4s , v2.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "fmla v13.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v12.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v4.16b, v5.16b, #8 \n" /* v16 = 2345 */ /* r2 */ \ + "fmla v14.4s , v4.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v13.4s , v4.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v12.4s , v4.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "fmla v14.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v13.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v12.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "fmla v14.4s , 
v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ + +#define RIGHT_RESULT_S1 \ + /* r3 */ \ + "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "bif v12.16b, v22.16b, v18.16b \n" \ + \ + "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "st1 {v12.4s}, [%[doutr0]], #16 \n" \ + \ + "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ + "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "bif v13.16b, v23.16b, v18.16b \n" \ + \ + "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "st1 {v13.4s}, [%[doutr1]], #16 \n" \ + \ + "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ + "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + 
"bif v14.16b, v24.16b, v18.16b \n" \ + \ + "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "st1 {v14.4s}, [%[doutr2]], #16 \n" \ + \ + "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "bif v15.16b, v25.16b, v18.16b \n" \ + \ + "st1 {v15.4s}, [%[doutr3]], #16 \n" + +#define LEFT_RESULT_S1_RELU \ + /* r4 */ \ + "fmla v15.4s , v8.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ + "fmla v14.4s , v8.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w2[1]*/ \ + \ + "fmax v12.4s, v12.4s, %[vzero].4s \n" /*relu*/ \ + "fmax v13.4s, v13.4s, %[vzero].4s \n" /*relu*/ \ + \ + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w1[1]*/ \ + \ + "st1 {v12.4s}, [%[doutr0]], #16 \n" /* vst1q_f32() */ \ + "st1 {v13.4s}, [%[doutr1]], #16 \n" /* vst1q_f32() */ \ + \ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w1[1]*/ \ + \ + "ext v16.16b, %[vzero].16b, v10.16b, #12 \n" /* v16 = 00123*/ \ + "ext v17.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234 */ \ + "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ /* r5*/ \ + "fmla v15.4s , v10.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ + \ + "fmax v14.4s, v14.4s, %[vzero].4s \n" /*relu*/ \ + \ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ + \ + "st1 {v14.4s}, [%[doutr2]], #16 \n" /* vst1q_f32() */ \ + \ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + \ + "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + 
"ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ \ + \ + "fmax v15.4s, v15.4s, %[vzero].4s \n" /*relu*/ \ + \ + "st1 {v15.4s}, [%[doutr3]], #16 \n" /* vst1q_f32() */ \ + "cmp %w[cnt], #1 \n" \ + "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + "blt 3f \n" + +#define MID_RESULT_S1_RELU \ + /* r3 */ \ + "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "fmax v12.4s, v12.4s, %[vzero].4s \n" /*relu*/ \ + \ + "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "st1 {v12.4s}, [%[doutr0]], #16 \n" \ + \ + "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ + "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "fmax v13.4s, v13.4s, %[vzero].4s \n" /*relu*/ \ + \ + "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "st1 {v13.4s}, [%[doutr1]], #16 \n" \ + \ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 
{v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ \ + \ + /* r3 */ \ + "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "fmax v14.4s, v14.4s, %[vzero].4s \n" /*relu*/ \ + \ + "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "st1 {v14.4s}, [%[doutr2]], #16 \n" \ + \ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ \ + \ + "subs %w[cnt], %w[cnt], #1 \n" \ + \ + "fmax v15.4s, v15.4s, %[vzero].4s \n" /*relu*/ \ + \ + "st1 {v15.4s}, [%[doutr3]], #16 \n" \ + "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "bne 1b \n" + +#define RIGHT_RESULT_S1_RELU \ + /* r3 */ \ + "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "fmax v12.4s, v12.4s, %[vzero].4s \n" /*relu*/ \ + \ + "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "bif v12.16b, v22.16b, v18.16b \n" \ + \ + "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += 
din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ + "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "st1 {v12.4s}, [%[doutr0]], #16 \n" \ + "fmax v13.4s, v13.4s, %[vzero].4s \n" /*relu*/ \ + \ + "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "bif v13.16b, v23.16b, v18.16b \n" \ + \ + "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ \ + \ + "st1 {v13.4s}, [%[doutr1]], #16 \n" /* r3 */ \ + "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "fmax v14.4s, v14.4s, %[vzero].4s \n" /*relu*/ \ + \ + "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "bif v14.16b, v24.16b, v18.16b \n" \ + \ + "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "st1 {v14.4s}, [%[doutr2]], #16 \n" \ + \ + "fmax v15.4s, v15.4s, %[vzero].4s \n" /*relu*/ \ + \ + "bif v15.16b, v25.16b, v18.16b \n" \ + \ + "st1 {v15.4s}, [%[doutr3]], #16 \n" + +#define COMPUTE_S_S1 \ + "prfm pldl1keep, [%[din0]]\n" \ + "prfm pldl1keep, [%[din1]]\n" \ + "prfm pldl1keep, [%[din2]]\n" \ + "prfm pldl1keep, [%[din3]]\n" \ + \ + "ld1 {v0.4s}, [%[din0]], #16\n" \ + "ld1 {v1.4s}, [%[din1]], #16\n" \ + "ld1 {v2.4s}, [%[din2]], #16\n" \ + "ld1 {v3.4s}, [%[din3]], #16\n" \ + \ + "bif v0.16b, %[zero].16b, %[mask].16b\n" \ + "bif v1.16b, %[zero].16b, %[mask].16b\n" \ + "bif v2.16b, %[zero].16b, %[mask].16b\n" \ + "bif v3.16b, %[zero].16b, %[mask].16b\n" \ + \ + "ext v4.16b, %[zero].16b, v0.16b, #12\n" \ + "ext v5.16b, %[zero].16b, 
v1.16b, #12\n" \ + "ext v6.16b, %[zero].16b, v2.16b, #12\n" \ + "ext v7.16b, %[zero].16b, v3.16b, #12\n" \ + \ + "ext v8.16b, v0.16b, %[zero].16b, #4\n" \ + "ext v9.16b, v1.16b, %[zero].16b, #4\n" \ + "ext v10.16b, v2.16b, %[zero].16b, #4\n" \ + "ext v11.16b, v3.16b, %[zero].16b, #4\n" \ + \ + "fmul v12.4s, v0.4s, %[wr0].s[1]\n" \ + "fmul v13.4s, v1.4s, %[wr0].s[1]\n" \ + \ + "fmul v14.4s, v1.4s, %[wr1].s[1]\n" \ + "fmul v15.4s, v2.4s, %[wr1].s[1]\n" \ + \ + "fmul v16.4s, v2.4s, %[wr2].s[1]\n" \ + "fmul v17.4s, v3.4s, %[wr2].s[1]\n" \ + \ + "fmla v12.4s, v4.4s, %[wr0].s[0]\n" \ + "fmla v13.4s, v5.4s, %[wr0].s[0]\n" \ + \ + "fmla v14.4s, v5.4s, %[wr1].s[0]\n" \ + "fmla v15.4s, v6.4s, %[wr1].s[0]\n" \ + \ + "fmla v16.4s, v6.4s, %[wr2].s[0]\n" \ + "fmla v17.4s, v7.4s, %[wr2].s[0]\n" \ + \ + "fmla v12.4s, v8.4s, %[wr0].s[2]\n" \ + "fmla v13.4s, v9.4s, %[wr0].s[2]\n" \ + \ + "fmla v14.4s, v9.4s, %[wr1].s[2]\n" \ + "fmla v15.4s, v10.4s, %[wr1].s[2]\n" \ + \ + "fmla v16.4s, v10.4s, %[wr2].s[2]\n" \ + "fmla v17.4s, v11.4s, %[wr2].s[2]\n" \ + \ + "fadd v12.4s, v12.4s, v14.4s\n" \ + "fadd v12.4s, v12.4s, v16.4s\n" \ + \ + "fadd v13.4s, v13.4s, v15.4s\n" \ + "fadd v13.4s, v13.4s, v17.4s\n" \ + \ + "fadd v12.4s, v12.4s, %[bias].4s\n" \ + "fadd v13.4s, v13.4s, %[bias].4s\n" + +#define RESULT_S_S1 \ + "prfm pldl1keep, [%[out1]]\n" \ + "prfm pldl1keep, [%[out2]]\n" \ + \ + "st1 {v12.4s}, [%[out1]]\n" \ + "st1 {v13.4s}, [%[out2]]\n" + +#define RESULT_S_S1_RELU \ + "prfm pldl1keep, [%[out1]]\n" \ + "prfm pldl1keep, [%[out2]]\n" \ + \ + "fmax v12.4s, v12.4s, %[zero].4s\n" \ + "fmax v13.4s, v13.4s, %[zero].4s\n" \ + \ + "st1 {v12.4s}, [%[out1]]\n" \ + "st1 {v13.4s}, [%[out2]]\n" + +#define COMPUTE_S_S1_P0 \ + "prfm pldl1keep, [%[din0]]\n" \ + "prfm pldl1keep, [%[din1]]\n" \ + "prfm pldl1keep, [%[din2]]\n" \ + "prfm pldl1keep, [%[din3]]\n" \ + \ + "ld1 {v0.4s, v1.4s}, [%[din0]]\n" \ + "ld1 {v2.4s, v3.4s}, [%[din1]]\n" \ + "ld1 {v4.4s, v5.4s}, [%[din2]]\n" \ + "ld1 {v6.4s, v7.4s}, 
[%[din3]]\n" \ + \ + "bif v0.16b, %[zero].16b, %[mask1].16b\n" \ + "bif v1.16b, %[zero].16b, %[mask2].16b\n" \ + \ + "bif v2.16b, %[zero].16b, %[mask1].16b\n" \ + "bif v3.16b, %[zero].16b, %[mask2].16b\n" \ + \ + "bif v4.16b, %[zero].16b, %[mask1].16b\n" \ + "bif v5.16b, %[zero].16b, %[mask2].16b\n" \ + \ + "bif v6.16b, %[zero].16b, %[mask1].16b\n" \ + "bif v7.16b, %[zero].16b, %[mask2].16b\n" \ + \ + "ext v8.16b, v0.16b, v1.16b, #4\n" \ + "ext v9.16b, v0.16b, v1.16b, #8\n" \ + \ + "and v12.16b, %[vbias].16b, %[vbias].16b \n" \ + "and v13.16b, %[vbias].16b, %[vbias].16b \n" /* r0 */ \ + "fmul v10.4s, v0.4s, %[wr0].s[0]\n" \ + "fmul v11.4s, v8.4s, %[wr0].s[1]\n" \ + "fmla v12.4s, v9.4s, %[wr0].s[2]\n" \ + \ + "ext v8.16b, v2.16b, v3.16b, #4\n" \ + "ext v9.16b, v2.16b, v3.16b, #8\n" /* r1 */ \ + "fmul v14.4s, v2.4s, %[wr0].s[0]\n" \ + "fmla v10.4s, v2.4s, %[wr1].s[0]\n" \ + \ + "fmul v15.4s, v8.4s, %[wr0].s[1]\n" \ + "fmla v11.4s, v8.4s, %[wr1].s[1]\n" \ + \ + "fmla v13.4s, v9.4s, %[wr0].s[2]\n" \ + "fmla v12.4s, v9.4s, %[wr1].s[2]\n" \ + \ + "ext v8.16b, v4.16b, v5.16b, #4\n" \ + "ext v9.16b, v4.16b, v5.16b, #8\n" /* r2 */ \ + "fmla v14.4s, v4.4s, %[wr1].s[0]\n" \ + "fmla v10.4s, v4.4s, %[wr2].s[0]\n" \ + \ + "fmla v15.4s, v8.4s, %[wr1].s[1]\n" \ + "fmla v11.4s, v8.4s, %[wr2].s[1]\n" \ + \ + "fmla v13.4s, v9.4s, %[wr1].s[2]\n" \ + "fmla v12.4s, v9.4s, %[wr2].s[2]\n" \ + \ + "ext v8.16b, v6.16b, v7.16b, #4\n" \ + "ext v9.16b, v6.16b, v7.16b, #8\n" \ + \ + "fmla v14.4s, v6.4s, %[wr2].s[0]\n" \ + \ + "fmla v15.4s, v8.4s, %[wr2].s[1]\n" \ + \ + "fadd v12.4s, v12.4s, v10.4s\n" \ + \ + "fmla v13.4s, v9.4s, %[wr2].s[2]\n" \ + \ + "fadd v12.4s, v12.4s, v11.4s\n" \ + "fadd v13.4s, v13.4s, v14.4s\n" \ + "fadd v13.4s, v13.4s, v15.4s\n" // \ + // "prfm pldl1keep, [%[out1]]\n" \ + // "prfm pldl1keep, [%[out2]]\n" \ + // \ + // "st1 {v12.4s}, [%[out1]]\n" \ + // "st1 {v13.4s}, [%[out2]]\n" \ + + +#else +#define INIT_S1 \ + "pld [%[din0_ptr]] @ preload data\n" \ + "pld 
[%[din1_ptr]] @ preload data\n" \ + "pld [%[din2_ptr]] @ preload data\n" \ + "pld [%[din3_ptr]] @ preload data\n" \ + \ + "vld1.32 {d16-d18}, [%[din0_ptr]]! @ load din r0\n" \ + "vld1.32 {d20-d22}, [%[din1_ptr]]! @ load din r1\n" \ + "vld1.32 {d24-d26}, [%[din2_ptr]]! @ load din r2\n" \ + "vld1.32 {d28-d30}, [%[din3_ptr]]! @ load din r3\n" \ + \ + "vdup.32 q4, %[bias_val] @ and \n" \ + "vdup.32 q5, %[bias_val] @ and \n" + +#define LEFT_COMPUTE_S1 \ + "vext.32 q6, %q[vzero], q8, #3 @ 0012\n" \ + "vext.32 q7, q8, q9, #1 @ 1234\n" /* r0 */ \ + "vmla.f32 q4, q8, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "sub %[din0_ptr], #12 @ 1pad + 2 float data overlap\n" \ + "sub %[din1_ptr], #12 @ 1pad + 2 float data overlap\n" \ + "sub %[din2_ptr], #12 @ 1pad + 2 float data overlap\n" \ + "sub %[din3_ptr], #12 @ 1pad + 2 float data overlap\n" \ + \ + "vmla.f32 q4, q6, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" \ + \ + "pld [%[din0_ptr]] @ preload data\n" \ + "pld [%[din1_ptr]] @ preload data\n" \ + "pld [%[din2_ptr]] @ preload data\n" \ + "pld [%[din3_ptr]] @ preload data\n" \ + \ + "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 1234 * wr0[2]\n" \ + \ + "vext.32 q6, %q[vzero], q10, #3 @ 0012\n" \ + "vext.32 q7, q10, q11, #1 @ 1234\n" \ + \ + /* r1 */ \ + "vmla.f32 q5, q10, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q10, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d16-d17}, [%[din0_ptr]]! @ load din r0\n" \ + "vld1.32 {d20-d21}, [%[din1_ptr]]! 
@ load din r0\n" \ + \ + "vmla.f32 q5, q6, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" \ + "vmla.f32 q4, q6, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" \ + \ + "vld1.32 {d18}, [%[din0_ptr]] @ load din r0\n" \ + "vld1.32 {d22}, [%[din1_ptr]] @ load din r0\n" \ + \ + "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[2]\n" \ + "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[2]\n" \ + \ + "vext.32 q6, %q[vzero], q12, #3 @ 0012\n" \ + "vext.32 q7, q12, q13, #1 @ 1234\n" \ + \ + /* r2 */ \ + "vmla.f32 q5, q12, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q12, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d24-d25}, [%[din2_ptr]]! @ load din r0\n" \ + \ + "vmla.f32 q5, q6, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" \ + "vmla.f32 q4, q6, %e[wr2][0] @ q4 += 1234 * wr0[0]\n" \ + \ + "vld1.32 {d26}, [%[din2_ptr]] @ load din r0\n" \ + \ + "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[2]\n" \ + "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[2]\n" \ + \ + "vext.32 q6, %q[vzero], q14, #3 @ 0012\n" \ + "vext.32 q7, q14, q15, #1 @ 1234\n" + +#define LEFT_RESULT_S1 \ + /* r3 */ \ + "vmla.f32 q5, q14, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" \ + "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \ + \ + "vmla.f32 q5, q6, %e[wr2][0] @ q4 += 1234 * wr0[0]\n" \ + \ + "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" \ + "vdup.32 q4, %[bias_val] @ and \n" \ + \ + "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 1234 * wr0[2]\n" \ + \ + "vext.32 q6, q8, q9, #1 @ 1234\n" \ + "vext.32 q7, q8, q9, #2 @ 2345\n" \ + "cmp %[cnt], #1 @ check whether has mid cols\n" \ + \ + "vst1.32 {d10-d11}, [%[dout_ptr2]]! 
@ store result, add pointer\n" \ + \ + "vdup.32 q5, %[bias_val] @ and \n" \ + "blt 3f @ jump to main loop start point\n" + +#define MID_COMPUTE_S1 \ + "1: @ right pad entry\n" /* r0 */ \ + "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" \ + \ + "pld [%[din0_ptr]] @ preload data\n" \ + "pld [%[din1_ptr]] @ preload data\n" \ + "pld [%[din2_ptr]] @ preload data\n" \ + "pld [%[din3_ptr]] @ preload data\n" \ + \ + "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d16-d17}, [%[din0_ptr]]! @ load din r0\n" \ + \ + "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" \ + \ + "vld1.32 {d18}, [%[din0_ptr]] @ load din r0\n" \ + \ + "vext.32 q6, q10, q11, #1 @ 1234\n" \ + "vext.32 q7, q10, q11, #2 @ 2345\n" /* r1 */ \ + "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" \ + "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d20-d21}, [%[din1_ptr]]! @ load din r0\n" \ + \ + "vmla.f32 q5, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d22}, [%[din1_ptr]] @ load din r0\n" \ + \ + "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vext.32 q6, q12, q13, #1 @ 1234\n" \ + "vext.32 q7, q12, q13, #2 @ 2345\n" /* r2 */ \ + "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" \ + "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d24-d25}, [%[din2_ptr]]! 
@ load din r0\n" \ + \ + "vmla.f32 q5, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d26}, [%[din2_ptr]] @ load din r0\n" \ + \ + "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vext.32 q6, q14, q15, #1 @ 1234\n" \ + "vext.32 q7, q14, q15, #2 @ 2345\n" + +#define MID_RESULT_S1 \ + /* r3 */ \ + "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \ + \ + "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" \ + "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \ + \ + "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" \ + "vdup.32 q4, %[bias_val] @ and \n" \ + \ + "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" \ + \ + "vext.32 q6, q8, q9, #1 @ 1234\n" \ + "vext.32 q7, q8, q9, #2 @ 2345\n" \ + \ + "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add pointer\n" \ + \ + "subs %[cnt], #1 @ loop count minus 1\n" \ + \ + "vdup.32 q5, %[bias_val] @ and \n" \ + \ + "bne 1b @ jump to main loop start point\n" + +#define RIGHT_COMPUTE_S1 \ + "3: @ right pad entry\n" \ + "vld1.32 {d19}, [%[vmask]]! @ load din r0\n" \ + "vld1.32 {d23}, [%[vmask]]! @ load din r0\n" \ + \ + "vld1.32 {d27}, [%[vmask]]! @ load din r0\n" \ + "vld1.32 {d31}, [%[vmask]]! 
@ load din r0\n" \ + \ + "vbif d16, %e[vzero], d19 @ bit select, deal with right pad\n" \ + "vbif d17, %e[vzero], d23 @ bit select, deal with right pad\n" \ + "vbif d18, %e[vzero], d27 @ bit select, deal with right pad\n" \ + \ + "vbif d20, %e[vzero], d19 @ bit select, deal with right pad\n" \ + "vbif d21, %e[vzero], d23 @ bit select, deal with right pad\n" \ + "vbif d22, %e[vzero], d27 @ bit select, deal with right pad\n" \ + \ + "vext.32 q6, q8, q9, #1 @ 1234\n" \ + "vext.32 q7, q8, q9, #2 @ 2345\n" /* r0 */ \ + "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" \ + \ + "vbif d24, %e[vzero], d19 @ bit select, deal with right pad\n" \ + "vbif d25, %e[vzero], d23 @ bit select, deal with right pad\n" \ + "vbif d26, %e[vzero], d27 @ bit select, deal with right pad\n" \ + \ + "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vbif d28, %e[vzero], d19 @ bit select, deal with right pad\n" \ + "vbif d29, %e[vzero], d23 @ bit select, deal with right pad\n" \ + "vbif d30, %e[vzero], d27 @ bit select, deal with right pad\n" \ + \ + "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" \ + \ + "vext.32 q6, q10, q11, #1 @ 1234\n" \ + "vext.32 q7, q10, q11, #2 @ 2345\n" /* r1 */ \ + "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" \ + "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d19}, [%[rmask]]! @ load din r0\n" \ + "vld1.32 {d23}, [%[rmask]]! 
@ load din r0\n" \ + \ + "vmla.f32 q5, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d16-d17}, [%[dout_ptr1]] @ load din r0\n" \ + "vld1.32 {d20-d21}, [%[dout_ptr2]] @ load din r0\n" \ + \ + "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vext.32 q6, q12, q13, #1 @ 1234\n" \ + "vext.32 q7, q12, q13, #2 @ 2345\n" /* r2 */ \ + "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" \ + "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vmla.f32 q5, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vext.32 q6, q14, q15, #1 @ 1234\n" \ + "vext.32 q7, q14, q15, #2 @ 2345\n" + +#define RIGHT_RESULT_S1 \ + /* r3 */ \ + "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \ + \ + "vbif d8, d16, d19 @ bit select, deal with right pad\n" \ + "vbif d9, d17, d23 @ bit select, deal with right pad\n" \ + \ + "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \ + \ + "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" \ + \ + "vbif d10, d20, d19 @ bit select, deal with right pad\n" \ + "vbif d11, d21, d23 @ bit select, deal with right pad\n" \ + \ + "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add pointer\n" + +#define LEFT_RESULT_S1_RELU \ + /* r3 */ \ + "vmla.f32 q5, q14, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" \ + "vmax.f32 q4, q4, %q[vzero] @ relu \n" \ + \ + "vmla.f32 q5, q6, %e[wr2][0] @ q4 += 1234 * wr0[0]\n" \ + \ + "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" \ + "vst1.32 {d8-d9}, [%[dout_ptr1]]! 
@ store result, add pointer\n" \ + \ + "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 1234 * wr0[2]\n" \ + \ + "vext.32 q6, q8, q9, #1 @ 1234\n" \ + "vext.32 q7, q8, q9, #2 @ 2345\n" \ + "vdup.32 q4, %[bias_val] @ and \n" \ + \ + "vmax.f32 q5, q5, %q[vzero] @ relu \n" \ + \ + "cmp %[cnt], #1 @ check whether has mid cols\n" \ + \ + "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add pointer\n" \ + \ + "vdup.32 q5, %[bias_val] @ and \n" \ + "blt 3f @ jump to main loop start point\n" + +#define MID_RESULT_S1_RELU \ + /* r3 */ \ + "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \ + \ + "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" \ + "vmax.f32 q4, q4, %q[vzero] @ relu \n" \ + \ + "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" \ + "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \ + \ + "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" \ + \ + "vext.32 q6, q8, q9, #1 @ 1234\n" \ + "vext.32 q7, q8, q9, #2 @ 2345\n" \ + "vdup.32 q4, %[bias_val] @ and \n" \ + \ + "vmax.f32 q5, q5, %q[vzero] @ relu \n" \ + \ + "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add pointer\n" \ + \ + "subs %[cnt], #1 @ loop count minus 1\n" \ + \ + "vdup.32 q5, %[bias_val] @ and \n" \ + \ + "bne 1b @ jump to main loop start point\n" + +#define RIGHT_RESULT_S1_RELU \ + /* r3 */ \ + "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \ + \ + "vmax.f32 q4, q4, %q[vzero] @ relu \n" \ + \ + "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vbif d8, d16, d19 @ bit select, deal with right pad\n" \ + "vbif d9, d17, d23 @ bit select, deal with right pad\n" \ + \ + "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" \ + "vst1.32 {d8-d9}, [%[dout_ptr1]]! 
@ store result, add pointer\n" \ + \ + "vmax.f32 q5, q5, %q[vzero] @ relu \n" \ + \ + "vbif d10, d20, d19 @ bit select, deal with right pad\n" \ + "vbif d11, d21, d23 @ bit select, deal with right pad\n" \ + \ + "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add pointer\n" + +#define COMPUTE_S_S1 \ + "pld [%[din0]]\n" \ + "pld [%[din1]]\n" \ + "pld [%[din2]]\n" \ + "pld [%[din3]]\n" \ + \ + "vld1.32 {d12-d13}, [%[din0]]!\n" \ + "vld1.32 {d14-d15}, [%[din1]]!\n" \ + "vld1.32 {d16-d17}, [%[din2]]!\n" \ + "vld1.32 {d18-d19}, [%[din3]]!\n" \ + \ + "vbif q6, %q[vzero], %q[mask]\n" \ + "vbif q7, %q[vzero], %q[mask]\n" \ + "vbif q8, %q[vzero], %q[mask]\n" \ + "vbif q9, %q[vzero], %q[mask]\n" \ + \ + "vmul.f32 q14, q6, %e[wr0][1]\n" \ + "vmul.f32 q15, q7, %e[wr0][1]\n" \ + \ + "vmla.f32 q14, q7, %e[wr1][1]\n" \ + "vmla.f32 q15, q8, %e[wr1][1]\n" \ + \ + "vmla.f32 q14, q8, %e[wr2][1]\n" \ + "vmla.f32 q15, q9, %e[wr2][1]\n" \ + \ + "vext.32 q10, %q[vzero], q6, #3\n" \ + "vext.32 q11, %q[vzero], q7, #3\n" \ + "vext.32 q12, %q[vzero], q8, #3\n" \ + "vext.32 q13, %q[vzero], q9, #3\n" \ + \ + "vmla.f32 q14, q10, %e[wr0][0]\n" \ + "vmla.f32 q15, q11, %e[wr0][0]\n" \ + \ + "vmla.f32 q14, q11, %e[wr1][0]\n" \ + "vmla.f32 q15, q12, %e[wr1][0]\n" \ + \ + "vmla.f32 q14, q12, %e[wr2][0]\n" \ + "vmla.f32 q15, q13, %e[wr2][0]\n" \ + \ + "vext.32 q10, q6, %q[vzero], #1\n" \ + "vext.32 q11, q7, %q[vzero], #1\n" \ + "vext.32 q12, q8, %q[vzero], #1\n" \ + "vext.32 q13, q9, %q[vzero], #1\n" \ + \ + "vmla.f32 q14, q10, %f[wr0][0]\n" \ + "vmla.f32 q15, q11, %f[wr0][0]\n" \ + \ + "vmla.f32 q14, q11, %f[wr1][0]\n" \ + "vmla.f32 q15, q12, %f[wr1][0]\n" \ + \ + "vmla.f32 q14, q12, %f[wr2][0]\n" \ + "vmla.f32 q15, q13, %f[wr2][0]\n" \ + \ + "vadd.f32 q14, q14, %q[bias]\n" \ + "vadd.f32 q15, q15, %q[bias]\n" + +#define RESULT_S_S1 \ + "pld [%[out1]]\n" \ + "pld [%[out2]]\n" \ + \ + "vst1.32 {d28-d29}, [%[out1]]\n" \ + "vst1.32 {d30-d31}, [%[out2]]\n" + +#define RESULT_S_S1_RELU \ + "pld 
[%[out1]]\n" \ + "pld [%[out2]]\n" \ + \ + "vmax.f32 q14, q14, %q[vzero]\n" \ + "vmax.f32 q15, q15, %q[vzero]\n" \ + \ + "vst1.32 {d28-d29}, [%[out1]]\n" \ + "vst1.32 {d30-d31}, [%[out2]]\n" + +#define COMPUTE_S_S1_P0 \ + "pld [%[din0]]\n" \ + "pld [%[din1]]\n" \ + "pld [%[din2]]\n" \ + "pld [%[din3]]\n" \ + "vld1.32 {d16-d18}, [%[din0]] @ load din r0\n" \ + "vld1.32 {d20-d22}, [%[din1]] @ load din r1\n" \ + "vld1.32 {d24-d26}, [%[din2]] @ load din r2\n" \ + "vld1.32 {d28-d30}, [%[din3]] @ load din r3\n" \ + \ + "vdup.32 q4, %[bias_val] @ and \n" \ + "vdup.32 q5, %[bias_val] @ and \n" \ + \ + "vld1.32 {d19}, [%[vmask]]! @ load din r0\n" \ + "vld1.32 {d23}, [%[vmask]]! @ load din r0\n" \ + \ + "vld1.32 {d27}, [%[vmask]]! @ load din r0\n" \ + \ + "vbif d16, %e[vzero], d19 @ bit select, deal with right pad\n" \ + "vbif d20, %e[vzero], d19 @ bit select, deal with right pad\n" \ + \ + "vbif d17, %e[vzero], d23 @ bit select, deal with right pad\n" \ + "vbif d21, %e[vzero], d23 @ bit select, deal with right pad\n" \ + \ + "vbif d18, %e[vzero], d27 @ bit select, deal with right pad\n" \ + "vbif d22, %e[vzero], d27 @ bit select, deal with right pad\n" \ + \ + "vext.32 q6, q8, q9, #1 @ 1234\n" \ + "vext.32 q7, q8, q9, #2 @ 2345\n" /* r0 */ \ + "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" \ + \ + "vbif d24, %e[vzero], d19 @ bit select, deal with right pad\n" \ + "vbif d25, %e[vzero], d23 @ bit select, deal with right pad\n" \ + "vbif d26, %e[vzero], d27 @ bit select, deal with right pad\n" \ + \ + "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vbif d28, %e[vzero], d19 @ bit select, deal with right pad\n" \ + "vbif d29, %e[vzero], d23 @ bit select, deal with right pad\n" \ + "vbif d30, %e[vzero], d27 @ bit select, deal with right pad\n" \ + \ + "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" \ + \ + "vext.32 q6, q10, q11, #1 @ 1234\n" \ + "vext.32 q7, q10, q11, #2 @ 2345\n" /* r1 */ \ + "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" \ + 
"vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vmul.f32 q8, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ + "vmul.f32 q10, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vmul.f32 q9, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" \ + "vmul.f32 q11, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vext.32 q6, q12, q13, #1 @ 1234\n" \ + "vext.32 q7, q12, q13, #2 @ 2345\n" /* r2 */ \ + "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" \ + "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vmla.f32 q8, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q10, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vmla.f32 q9, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q11, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vext.32 q6, q14, q15, #1 @ 1234\n" \ + "vext.32 q7, q14, q15, #2 @ 2345\n" /* r3 */ \ + "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \ + \ + "vmla.f32 q8, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + "vadd.f32 q4, q4, q10 @ q4 += q10 \n" \ + \ + "pld [%[out1]]\n" \ + "pld [%[out2]]\n" \ + \ + "vmla.f32 q9, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" \ + "vadd.f32 q14, q4, q11 @ q4 += q10 \n" \ + \ + "vadd.f32 q5, q5, q8 @ q4 += q10 \n" \ + "vadd.f32 q15, q5, q9 @ q4 += q10 \n" + +#endif +/** + * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, + * width > 4 + */ +void conv_depthwise_3x3s1p1_bias_relu(float *dout, + const float *din, + const float *weights, + const float *bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext *ctx) { + //! pad is done implicit + const float zero[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + //! 
for 4x6 convolution window + const unsigned int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0}; + + float *zero_ptr = ctx->workspace_data(); + memset(zero_ptr, 0, w_in * sizeof(float)); + float *write_ptr = zero_ptr + w_in; + + int size_in_channel = w_in * h_in; + int size_out_channel = w_out * h_out; + int w_stride = 9; + + int tile_w = w_out >> 2; + int remain = w_out % 4; + int cnt_col = tile_w - 1; + + unsigned int size_pad_right = (unsigned int)(5 + (tile_w << 2) - w_in); + const unsigned int remian_idx[4] = {0, 1, 2, 3}; + + if (remain == 0 && size_pad_right == 5) { + size_pad_right = 1; + cnt_col -= 1; + remain = 4; + } else if (remain == 0 && size_pad_right == 6) { + size_pad_right = 2; + cnt_col -= 1; + remain = 4; + } + + uint32x4_t vmask_rp1 = + vcgeq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right)); + uint32x4_t vmask_rp2 = + vcgeq_u32(vld1q_u32(right_pad_idx + 4), vdupq_n_u32(size_pad_right)); + uint32x4_t vmask_result = + vcgtq_u32(vdupq_n_u32(remain), vld1q_u32(remian_idx)); + + unsigned int vmask[8]; + vst1q_u32(vmask, vmask_rp1); + vst1q_u32(vmask + 4, vmask_rp2); + + unsigned int rmask[4]; + vst1q_u32(rmask, vmask_result); + + float32x4_t vzero = vdupq_n_f32(0.f); + + for (int n = 0; n < num; ++n) { + const float *din_batch = din + n * ch_in * size_in_channel; + float *dout_batch = dout + n * ch_in * size_out_channel; +#pragma omp parallel for + for (int c = 0; c < ch_in; c++) { + float *dout_ptr = dout_batch + c * size_out_channel; + + const float *din_ch_ptr = din_batch + c * size_in_channel; + + float bias_val = flag_bias ? 
bias[c] : 0.f; + float vbias[4] = {bias_val, bias_val, bias_val, bias_val}; + + const float *wei_ptr = weights + c * w_stride; + + float32x4_t wr0 = vld1q_f32(wei_ptr); + float32x4_t wr1 = vld1q_f32(wei_ptr + 3); + float32x4_t wr2 = vld1q_f32(wei_ptr + 6); + + float *doutr0 = dout_ptr; + float *doutr1 = doutr0 + w_out; + float *doutr2 = doutr1 + w_out; + float *doutr3 = doutr2 + w_out; + + const float *dr0 = din_ch_ptr; + const float *dr1 = dr0 + w_in; + const float *dr2 = dr1 + w_in; + const float *dr3 = dr2 + w_in; + const float *dr4 = dr3 + w_in; + const float *dr5 = dr4 + w_in; + + const float *din_ptr0 = dr0; + const float *din_ptr1 = dr1; + const float *din_ptr2 = dr2; + const float *din_ptr3 = dr3; + const float *din_ptr4 = dr4; + const float *din_ptr5 = dr5; + float *ptr_zero = const_cast(zero); +#ifdef __aarch64__ + for (int i = 0; i < h_in; i += 4) { + //! process top pad pad_h = 1 + din_ptr0 = dr0; + din_ptr1 = dr1; + din_ptr2 = dr2; + din_ptr3 = dr3; + din_ptr4 = dr4; + din_ptr5 = dr5; + + doutr0 = dout_ptr; + doutr1 = doutr0 + w_out; + doutr2 = doutr1 + w_out; + doutr3 = doutr2 + w_out; + if (i == 0) { + din_ptr0 = zero_ptr; + din_ptr1 = dr0; + din_ptr2 = dr1; + din_ptr3 = dr2; + din_ptr4 = dr3; + din_ptr5 = dr4; + dr0 = dr3; + dr1 = dr4; + dr2 = dr5; + } else { + dr0 = dr4; + dr1 = dr5; + dr2 = dr1 + w_in; + } + dr3 = dr2 + w_in; + dr4 = dr3 + w_in; + dr5 = dr4 + w_in; + + //! process bottom pad + if (i + 5 > h_in) { + switch (i + 5 - h_in) { + case 5: + din_ptr1 = zero_ptr; + case 4: + din_ptr2 = zero_ptr; + case 3: + din_ptr3 = zero_ptr; + case 2: + din_ptr4 = zero_ptr; + case 1: + din_ptr5 = zero_ptr; + default: + break; + } + } + //! 
process bottom remain + if (i + 4 > h_out) { + switch (i + 4 - h_out) { + case 3: + doutr1 = write_ptr; + case 2: + doutr2 = write_ptr; + case 1: + doutr3 = write_ptr; + default: + break; + } + } + + int cnt = cnt_col; + if (flag_relu) { + asm volatile( + INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_RELU MID_COMPUTE_S1 + MID_RESULT_S1_RELU RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_RELU + : [cnt] "+r"(cnt), + [din_ptr0] "+r"(din_ptr0), + [din_ptr1] "+r"(din_ptr1), + [din_ptr2] "+r"(din_ptr2), + [din_ptr3] "+r"(din_ptr3), + [din_ptr4] "+r"(din_ptr4), + [din_ptr5] "+r"(din_ptr5), + [doutr0] "+r"(doutr0), + [doutr1] "+r"(doutr1), + [doutr2] "+r"(doutr2), + [doutr3] "+r"(doutr3) + : [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [bias_val] "r"(vbias), + [vmask] "r"(vmask), + [rmask] "r"(rmask), + [vzero] "w"(vzero) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "v23", + "v24", + "v25"); + } else { + asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1 MID_COMPUTE_S1 + MID_RESULT_S1 RIGHT_COMPUTE_S1 RIGHT_RESULT_S1 + : [cnt] "+r"(cnt), + [din_ptr0] "+r"(din_ptr0), + [din_ptr1] "+r"(din_ptr1), + [din_ptr2] "+r"(din_ptr2), + [din_ptr3] "+r"(din_ptr3), + [din_ptr4] "+r"(din_ptr4), + [din_ptr5] "+r"(din_ptr5), + [doutr0] "+r"(doutr0), + [doutr1] "+r"(doutr1), + [doutr2] "+r"(doutr2), + [doutr3] "+r"(doutr3) + : [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [bias_val] "r"(vbias), + [vmask] "r"(vmask), + [rmask] "r"(rmask), + [vzero] "w"(vzero) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "v23", + "v24", + "v25"); + } + dout_ptr = dout_ptr + 4 * w_out; + } +#else + for (int i = 0; i < h_in; i += 2) { + //! 
process top pad pad_h = 1 + din_ptr0 = dr0; + din_ptr1 = dr1; + din_ptr2 = dr2; + din_ptr3 = dr3; + + doutr0 = dout_ptr; + doutr1 = dout_ptr + w_out; + // unsigned int* rst_mask = rmask; + + if (i == 0) { + din_ptr0 = zero_ptr; + din_ptr1 = dr0; + din_ptr2 = dr1; + din_ptr3 = dr2; + dr0 = dr1; + dr1 = dr2; + dr2 = dr3; + dr3 = dr2 + w_in; + } else { + dr0 = dr2; + dr1 = dr3; + dr2 = dr1 + w_in; + dr3 = dr2 + w_in; + } + //! process bottom pad + if (i + 3 > h_in) { + switch (i + 3 - h_in) { + case 3: + din_ptr1 = zero_ptr; + case 2: + din_ptr2 = zero_ptr; + case 1: + din_ptr3 = zero_ptr; + default: + break; + } + } + //! process bottom remain + if (i + 2 > h_out) { + doutr1 = write_ptr; + } + int cnt = cnt_col; + unsigned int *rmask_ptr = rmask; + unsigned int *vmask_ptr = vmask; + if (flag_relu) { + asm volatile( + INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_RELU MID_COMPUTE_S1 + MID_RESULT_S1_RELU RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_RELU + : [dout_ptr1] "+r"(doutr0), + [dout_ptr2] "+r"(doutr1), + [din0_ptr] "+r"(din_ptr0), + [din1_ptr] "+r"(din_ptr1), + [din2_ptr] "+r"(din_ptr2), + [din3_ptr] "+r"(din_ptr3), + [cnt] "+r"(cnt), + [rmask] "+r"(rmask_ptr), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias_val] "r"(bias_val), + [vzero] "w"(vzero) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } else { + asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1 MID_COMPUTE_S1 + MID_RESULT_S1 RIGHT_COMPUTE_S1 RIGHT_RESULT_S1 + : [dout_ptr1] "+r"(doutr0), + [dout_ptr2] "+r"(doutr1), + [din0_ptr] "+r"(din_ptr0), + [din1_ptr] "+r"(din_ptr1), + [din2_ptr] "+r"(din_ptr2), + [din3_ptr] "+r"(din_ptr3), + [cnt] "+r"(cnt), + [rmask] "+r"(rmask_ptr), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias_val] "r"(bias_val), + [vzero] "w"(vzero) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + 
"q13", + "q14", + "q15"); + } + dout_ptr += 2 * w_out; + } //! end of processing mid rows +#endif + } + } +} + +/** + * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, + * width <= 4 + */ +void conv_depthwise_3x3s1p1_bias_s_relu(float *dout, + const float *din, + const float *weights, + const float *bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext *ctx) { + //! 3x3s1 convolution, implemented by direct algorithm + //! pad is done implicit + //! for 4x6 convolution window + const int right_pad_idx[4] = {3, 2, 1, 0}; + const float zero[4] = {0.f, 0.f, 0.f, 0.f}; + + float32x4_t vzero = vdupq_n_f32(0.f); + uint32x4_t vmask_rp = + vcgeq_s32(vld1q_s32(right_pad_idx), vdupq_n_s32(4 - w_in)); + int size_in_channel = w_in * h_in; + int size_out_channel = w_out * h_out; + for (int n = 0; n < num; ++n) { + const float *din_batch = din + n * ch_in * size_in_channel; + float *dout_batch = dout + n * ch_in * size_out_channel; +#pragma omp parallel for + for (int i = 0; i < ch_in; ++i) { + float *dout_channel = dout_batch + i * size_out_channel; + const float *din_channel = din_batch + i * size_in_channel; + const float *weight_ptr = weights + i * 9; + float32x4_t wr0 = vld1q_f32(weight_ptr); + float32x4_t wr1 = vld1q_f32(weight_ptr + 3); + float32x4_t wr2 = vld1q_f32(weight_ptr + 6); + float32x4_t wbias; + if (flag_bias) { + wbias = vdupq_n_f32(bias[i]); + } else { + wbias = vdupq_n_f32(0.f); + } + + int hs = -1; + int he = 3; + + float out_buf1[4]; + float out_buf2[4]; + float trash_buf[4]; + + int h_cnt = (h_out + 1) >> 1; + float *doutr0 = dout_channel; + float *doutr1 = dout_channel + w_out; + + for (int j = 0; j < h_cnt; ++j) { + const float *dr0 = din_channel + hs * w_in; + const float *dr1 = dr0 + w_in; + const float *dr2 = dr1 + w_in; + const float *dr3 = dr2 + w_in; + + if (hs == -1) { + dr0 = zero; + } + + switch (he - h_in) { + 
case 2: + dr2 = zero; + doutr1 = trash_buf; + case 1: + dr3 = zero; + default: + break; + } +#ifdef __aarch64__ + if (flag_relu) { + asm volatile(COMPUTE_S_S1 RESULT_S_S1_RELU + : [din0] "+r"(dr0), + [din1] "+r"(dr1), + [din2] "+r"(dr2), + [din3] "+r"(dr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [zero] "w"(vzero), + [mask] "w"(vmask_rp), + [bias] "w"(wbias), + [out1] "r"(out_buf1), + [out2] "r"(out_buf2) + : "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17"); + } else { + asm volatile(COMPUTE_S_S1 RESULT_S_S1 + : [din0] "+r"(dr0), + [din1] "+r"(dr1), + [din2] "+r"(dr2), + [din3] "+r"(dr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [zero] "w"(vzero), + [mask] "w"(vmask_rp), + [bias] "w"(wbias), + [out1] "r"(out_buf1), + [out2] "r"(out_buf2) + : "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17"); + } +#else + if (flag_relu) { + asm volatile(COMPUTE_S_S1 RESULT_S_S1_RELU + : [din0] "+r"(dr0), + [din1] "+r"(dr1), + [din2] "+r"(dr2), + [din3] "+r"(dr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [mask] "w"(vmask_rp), + [bias] "w"(wbias), + [out1] "r"(out_buf1), + [out2] "r"(out_buf2) + : "cc", + "memory", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } else { + asm volatile(COMPUTE_S_S1 RESULT_S_S1 + : [din0] "+r"(dr0), + [din1] "+r"(dr1), + [din2] "+r"(dr2), + [din3] "+r"(dr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [mask] "w"(vmask_rp), + [bias] "w"(wbias), + [out1] "r"(out_buf1), + [out2] "r"(out_buf2) + : "cc", + "memory", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } +#endif + for (int w = 0; w < w_out; ++w) { + *doutr0++ = out_buf1[w]; + *doutr1++ = out_buf2[w]; + } + doutr0 = 
doutr1; + doutr1 += w_out; + hs += 2; + he += 2; + } // end of processing heights + } // end of processing channels + } // end of processing batchs +} + +/** + * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, + * width > 4 + */ +void conv_depthwise_3x3s1p0_bias_relu(float *dout, + const float *din, + const float *weights, + const float *bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext *ctx) { + //! pad is done implicit + const float zero[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + //! for 4x6 convolution window + const unsigned int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0}; + + float *zero_ptr = ctx->workspace_data(); + memset(zero_ptr, 0, w_in * sizeof(float)); + float *write_ptr = zero_ptr + w_in; + + int size_in_channel = w_in * h_in; + int size_out_channel = w_out * h_out; + int w_stride = 9; + + int tile_w = w_out >> 2; + int remain = w_out % 4; + + unsigned int size_pad_right = (unsigned int)(6 + (tile_w << 2) - w_in); + const int remian_idx[4] = {0, 1, 2, 3}; + + uint32x4_t vmask_rp1 = + vcgeq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right)); + uint32x4_t vmask_rp2 = + vcgeq_u32(vld1q_u32(right_pad_idx + 4), vdupq_n_u32(size_pad_right)); + uint32x4_t vmask_result = + vcgtq_s32(vdupq_n_s32(remain), vld1q_s32(remian_idx)); + + unsigned int vmask[8]; + vst1q_u32(vmask, vmask_rp1); + vst1q_u32(vmask + 4, vmask_rp2); + + unsigned int rmask[4]; + vst1q_u32(rmask, vmask_result); + + float32x4_t vzero = vdupq_n_f32(0.f); + + for (int n = 0; n < num; ++n) { + const float *din_batch = din + n * ch_in * size_in_channel; + float *dout_batch = dout + n * ch_in * size_out_channel; +#pragma omp parallel for + for (int c = 0; c < ch_in; c++) { + float *dout_ptr = dout_batch + c * size_out_channel; + + const float *din_ch_ptr = din_batch + c * size_in_channel; + + float bias_val = flag_bias ? 
bias[c] : 0.f; + float vbias[4] = {bias_val, bias_val, bias_val, bias_val}; + + const float *wei_ptr = weights + c * w_stride; + + float32x4_t wr0 = vld1q_f32(wei_ptr); + float32x4_t wr1 = vld1q_f32(wei_ptr + 3); + float32x4_t wr2 = vld1q_f32(wei_ptr + 6); + + float *doutr0 = dout_ptr; + float *doutr1 = doutr0 + w_out; + float *doutr2 = doutr1 + w_out; + float *doutr3 = doutr2 + w_out; + + const float *dr0 = din_ch_ptr; + const float *dr1 = dr0 + w_in; + const float *dr2 = dr1 + w_in; + const float *dr3 = dr2 + w_in; + const float *dr4 = dr3 + w_in; + const float *dr5 = dr4 + w_in; + + const float *din_ptr0 = dr0; + const float *din_ptr1 = dr1; + const float *din_ptr2 = dr2; + const float *din_ptr3 = dr3; + const float *din_ptr4 = dr4; + const float *din_ptr5 = dr5; + + float *ptr_zero = const_cast(zero); +#ifdef __aarch64__ + for (int i = 0; i < h_out; i += 4) { + //! process top pad pad_h = 1 + din_ptr0 = dr0; + din_ptr1 = dr1; + din_ptr2 = dr2; + din_ptr3 = dr3; + din_ptr4 = dr4; + din_ptr5 = dr5; + + doutr0 = dout_ptr; + doutr1 = doutr0 + w_out; + doutr2 = doutr1 + w_out; + doutr3 = doutr2 + w_out; + + dr0 = dr4; + dr1 = dr5; + dr2 = dr1 + w_in; + dr3 = dr2 + w_in; + dr4 = dr3 + w_in; + dr5 = dr4 + w_in; + + //! process bottom pad + if (i + 5 >= h_in) { + switch (i + 5 - h_in) { + case 4: + din_ptr1 = zero_ptr; + case 3: + din_ptr2 = zero_ptr; + case 2: + din_ptr3 = zero_ptr; + case 1: + din_ptr4 = zero_ptr; + case 0: + din_ptr5 = zero_ptr; + default: + break; + } + } + //! 
process bottom remain + if (i + 4 > h_out) { + switch (i + 4 - h_out) { + case 3: + doutr1 = write_ptr; + case 2: + doutr2 = write_ptr; + case 1: + doutr3 = write_ptr; + default: + break; + } + } + + int cnt = tile_w; + if (flag_relu) { + asm volatile( + INIT_S1 + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ + "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v17 = 2345 */ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ + MID_COMPUTE_S1 MID_RESULT_S1_RELU + "cmp %w[remain], #1 \n" + "blt 0f \n" RIGHT_COMPUTE_S1 + RIGHT_RESULT_S1_RELU "0: \n" + : [cnt] "+r"(cnt), + [din_ptr0] "+r"(din_ptr0), + [din_ptr1] "+r"(din_ptr1), + [din_ptr2] "+r"(din_ptr2), + [din_ptr3] "+r"(din_ptr3), + [din_ptr4] "+r"(din_ptr4), + [din_ptr5] "+r"(din_ptr5), + [doutr0] "+r"(doutr0), + [doutr1] "+r"(doutr1), + [doutr2] "+r"(doutr2), + [doutr3] "+r"(doutr3) + : [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [bias_val] "r"(vbias), + [vmask] "r"(vmask), + [rmask] "r"(rmask), + [vzero] "w"(vzero), + [remain] "r"(remain) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "v23", + "v24", + "v25"); + } else { + asm volatile( + INIT_S1 + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ + "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v17 = 2345 */ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ + MID_COMPUTE_S1 MID_RESULT_S1 + "cmp %w[remain], #1 \n" + "blt 0f \n" RIGHT_COMPUTE_S1 + RIGHT_RESULT_S1 "0: \n" + : [cnt] "+r"(cnt), + [din_ptr0] 
"+r"(din_ptr0), + [din_ptr1] "+r"(din_ptr1), + [din_ptr2] "+r"(din_ptr2), + [din_ptr3] "+r"(din_ptr3), + [din_ptr4] "+r"(din_ptr4), + [din_ptr5] "+r"(din_ptr5), + [doutr0] "+r"(doutr0), + [doutr1] "+r"(doutr1), + [doutr2] "+r"(doutr2), + [doutr3] "+r"(doutr3) + : [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [bias_val] "r"(vbias), + [vmask] "r"(vmask), + [rmask] "r"(rmask), + [vzero] "w"(vzero), + [remain] "r"(remain) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "v23", + "v24", + "v25"); + } + dout_ptr = dout_ptr + 4 * w_out; + } +#else + for (int i = 0; i < h_out; i += 2) { + din_ptr0 = dr0; + din_ptr1 = dr1; + din_ptr2 = dr2; + din_ptr3 = dr3; + + doutr0 = dout_ptr; + doutr1 = dout_ptr + w_out; + + dr0 = dr2; + dr1 = dr3; + dr2 = dr1 + w_in; + dr3 = dr2 + w_in; + //! process bottom pad + if (i + 3 >= h_in) { + switch (i + 3 - h_in) { + case 3: + din_ptr1 = zero_ptr; + case 2: + din_ptr2 = zero_ptr; + case 1: + din_ptr3 = zero_ptr; + case 0: + din_ptr3 = zero_ptr; + default: + break; + } + } + //! 
process bottom remain + if (i + 2 > h_out) { + doutr1 = write_ptr; + } + int cnt = tile_w; + unsigned int *rmask_ptr = rmask; + unsigned int *vmask_ptr = vmask; + if (flag_relu) { + asm volatile(INIT_S1 + "sub %[din0_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din1_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din2_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din3_ptr], #8 @ 0pad + 2 float data overlap\n" + "vext.32 q6, q8, q9, #1 @ 0012\n" + "vext.32 q7, q8, q9, #2 @ 1234\n" MID_COMPUTE_S1 + MID_RESULT_S1_RELU + "cmp %[remain], #1 \n" + "blt 0f \n" RIGHT_COMPUTE_S1 + RIGHT_RESULT_S1_RELU "0: \n" + : [dout_ptr1] "+r"(doutr0), + [dout_ptr2] "+r"(doutr1), + [din0_ptr] "+r"(din_ptr0), + [din1_ptr] "+r"(din_ptr1), + [din2_ptr] "+r"(din_ptr2), + [din3_ptr] "+r"(din_ptr3), + [cnt] "+r"(cnt), + [rmask] "+r"(rmask_ptr), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias_val] "r"(bias_val), + [vzero] "w"(vzero), + [remain] "r"(remain) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } else { + asm volatile(INIT_S1 + "sub %[din0_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din1_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din2_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din3_ptr], #8 @ 0pad + 2 float data overlap\n" + "vext.32 q6, q8, q9, #1 @ 0012\n" + "vext.32 q7, q8, q9, #2 @ 1234\n" MID_COMPUTE_S1 + MID_RESULT_S1 + "cmp %[remain], #1 \n" + "blt 0f \n" RIGHT_COMPUTE_S1 + RIGHT_RESULT_S1 "0: \n" + : [dout_ptr1] "+r"(doutr0), + [dout_ptr2] "+r"(doutr1), + [din0_ptr] "+r"(din_ptr0), + [din1_ptr] "+r"(din_ptr1), + [din2_ptr] "+r"(din_ptr2), + [din3_ptr] "+r"(din_ptr3), + [cnt] "+r"(cnt), + [rmask] "+r"(rmask_ptr), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias_val] "r"(bias_val), + [vzero] "w"(vzero), + [remain] "r"(remain) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + 
"q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } + dout_ptr += 2 * w_out; + } //! end of processing mid rows +#endif + } + } +} +/** + * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, + * width <= 4 + */ +void conv_depthwise_3x3s1p0_bias_s_relu(float *dout, + const float *din, + const float *weights, + const float *bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext *ctx) { + //! 3x3s1 convolution, implemented by direct algorithm + //! pad is done implicit + //! for 4x6 convolution window + const int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0}; + const float zero_ptr[4] = {0.f, 0.f, 0.f, 0.f}; + + float32x4_t vzero = vdupq_n_f32(0.f); + uint32x4_t vmask_rp1 = + vcgeq_s32(vld1q_s32(right_pad_idx), vdupq_n_s32(6 - w_in)); + uint32x4_t vmask_rp2 = + vcgeq_s32(vld1q_s32(right_pad_idx + 4), vdupq_n_s32(6 - w_in)); + + unsigned int vmask[8]; + vst1q_u32(vmask, vmask_rp1); + vst1q_u32(vmask + 4, vmask_rp2); + + int size_in_channel = w_in * h_in; + int size_out_channel = w_out * h_out; + for (int n = 0; n < num; ++n) { + const float *din_batch = din + n * ch_in * size_in_channel; + float *dout_batch = dout + n * ch_in * size_out_channel; +#pragma omp parallel for + for (int i = 0; i < ch_in; ++i) { + float *dout_channel = dout_batch + i * size_out_channel; + const float *din_channel = din_batch + i * size_in_channel; + const float *weight_ptr = weights + i * 9; + float32x4_t wr0 = vld1q_f32(weight_ptr); + float32x4_t wr1 = vld1q_f32(weight_ptr + 3); + float32x4_t wr2 = vld1q_f32(weight_ptr + 6); + +#ifdef __aarch64__ + float32x4_t wbias; + if (flag_bias) { + wbias = vdupq_n_f32(bias[i]); + } else { + wbias = vdupq_n_f32(0.f); + } +#endif // __aarch64__ + + float out_buf1[4]; + float out_buf2[4]; + float trash_buf[4]; + + float *doutr0 = dout_channel; + float *doutr1 = dout_channel + w_out; + + for (int j = 0; j 
< h_out; j += 2) { + const float *dr0 = din_channel + j * w_in; + const float *dr1 = dr0 + w_in; + const float *dr2 = dr1 + w_in; + const float *dr3 = dr2 + w_in; + + doutr0 = dout_channel + j * w_out; + doutr1 = doutr0 + w_out; + + if (j + 3 >= h_in) { + switch (j + 3 - h_in) { + case 3: + dr1 = zero_ptr; + case 2: + dr2 = zero_ptr; + case 1: + dr3 = zero_ptr; + doutr1 = trash_buf; + case 0: + dr3 = zero_ptr; + if (j + 2 > h_out) { + doutr1 = trash_buf; + } + default: + break; + } + } +#ifdef __aarch64__ + if (flag_relu) { + asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_RELU + : [din0] "+r"(dr0), + [din1] "+r"(dr1), + [din2] "+r"(dr2), + [din3] "+r"(dr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vbias] "w"(wbias), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [zero] "w"(vzero), + [out1] "r"(out_buf1), + [out2] "r"(out_buf2) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15"); + } else { + asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1 + : [din0] "+r"(dr0), + [din1] "+r"(dr1), + [din2] "+r"(dr2), + [din3] "+r"(dr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vbias] "w"(wbias), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [zero] "w"(vzero), + [out1] "r"(out_buf1), + [out2] "r"(out_buf2) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15"); + } +#else + unsigned int *vmask_ptr = vmask; + float bias_val = flag_bias ? 
bias[i] : 0.f; + if (flag_relu) { + asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_RELU + : [din0] "+r"(dr0), + [din1] "+r"(dr1), + [din2] "+r"(dr2), + [din3] "+r"(dr3), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [bias_val] "r"(bias_val), + [out1] "r"(out_buf1), + [out2] "r"(out_buf2) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } else { + asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1 + : [din0] "+r"(dr0), + [din1] "+r"(dr1), + [din2] "+r"(dr2), + [din3] "+r"(dr3), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [bias_val] "r"(bias_val), + [out1] "r"(out_buf1), + [out2] "r"(out_buf2) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } +#endif + for (int w = 0; w < w_out; ++w) { + *doutr0++ = out_buf1[w]; + *doutr1++ = out_buf2[w]; + } + } // end of processing heights + } // end of processing channels + } // end of processing batchs +} +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/lite/backends/arm/math/conv3x3s1px_depthwise_fp32.cc b/lite/backends/arm/math/conv3x3s1px_depthwise_fp32.cc index 55ea94949ba93396c97be5e3ea66d6e29ce95429..c998ddc3a34c2f6194a5156b7d04b7a9db3fbcef 100644 --- a/lite/backends/arm/math/conv3x3s1px_depthwise_fp32.cc +++ b/lite/backends/arm/math/conv3x3s1px_depthwise_fp32.cc @@ -508,6 +508,8 @@ void act_switch_3x3s1(const float* inr0, "x5", "x6", "x7"); +#else +#if 1 // def LITE_WITH_ARM_CLANG #else asm volatile(COMPUTE RELU STORE : [r0] "+r"(inr0), @@ -541,6 +543,7 @@ void act_switch_3x3s1(const float* inr0, "r3", "r4", "r5"); +#endif #endif break; case lite_api::ActivationType::kRelu6: @@ -593,6 +596,8 @@ void act_switch_3x3s1(const float* inr0, "x5", "x6", "x7"); +#else +#if 1 // def LITE_WITH_ARM_CLANG #else asm 
volatile(COMPUTE RELU RELU6 STORE : [r0] "+r"(inr0), @@ -626,6 +631,7 @@ void act_switch_3x3s1(const float* inr0, "r3", "r4", "r5"); +#endif #endif break; case lite_api::ActivationType::kLeakyRelu: @@ -678,6 +684,8 @@ void act_switch_3x3s1(const float* inr0, "x5", "x6", "x7"); +#else +#if 1 // def LITE_WITH_ARM_CLANG #else asm volatile(COMPUTE LEAKY_RELU STORE : [r0] "+r"(inr0), @@ -711,6 +719,7 @@ void act_switch_3x3s1(const float* inr0, "r3", "r4", "r5"); +#endif #endif break; default: @@ -768,6 +777,8 @@ void act_switch_3x3s1(const float* inr0, "x5", "x6", "x7"); +#else +#if 1 // def LITE_WITH_ARM_CLANG #else asm volatile(COMPUTE STORE : [r0] "+r"(inr0), @@ -801,6 +812,7 @@ void act_switch_3x3s1(const float* inr0, "r3", "r4", "r5"); +#endif #endif } } @@ -988,6 +1000,8 @@ void conv_3x3s1_depthwise_fp32(const float* i_data, w8, vbias, act_param); +#else +#if 1 // def LITE_WITH_ARM_CLANG #else act_switch_3x3s1(inr0, inr1, @@ -1008,6 +1022,7 @@ void conv_3x3s1_depthwise_fp32(const float* i_data, vbias, vbias, act_param); +#endif #endif outl[0] += 4; outl[1] += 4; diff --git a/lite/backends/arm/math/conv3x3s2p01_depthwise_fp32.cc b/lite/backends/arm/math/conv3x3s2p01_depthwise_fp32.cc index 3e5569365119b97397c6d42f48bacd2552b248e5..d2e8f66a609d44d2c69228f3b9a343fdf91296a8 100644 --- a/lite/backends/arm/math/conv3x3s2p01_depthwise_fp32.cc +++ b/lite/backends/arm/math/conv3x3s2p01_depthwise_fp32.cc @@ -91,23 +91,20 @@ void conv_depthwise_3x3s2_fp32(const float* din, bool flag_bias, const operators::ActivationParam act_param, ARMContext* ctx) { - if (pad == 0) { - if (w_in > 7) { - conv_depthwise_3x3s2p0_bias(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - act_param, - ctx); + bool has_active = act_param.has_active; + bool flag_relu = false; + bool relu6 = false; + if (has_active) { + if (act_param.active_type == lite_api::ActivationType::kRelu) { + flag_relu = true; } else { - conv_depthwise_3x3s2p0_bias_s(dout, + relu6 
= true; + } + } + if (pad == 0) { + if (w_in > 8) { + if (relu6) { + conv_depthwise_3x3s2p0_bias(dout, din, weights, bias, @@ -120,25 +117,57 @@ void conv_depthwise_3x3s2_fp32(const float* din, w_out, act_param, ctx); + } else { + conv_depthwise_3x3s2p0_bias_relu(dout, + din, + weights, + bias, + flag_bias, + flag_relu, + num, + ch_in, + h_in, + w_in, + h_out, + w_out, + ctx); + } + } else { + if (relu6) { + conv_depthwise_3x3s2p0_bias_s(dout, + din, + weights, + bias, + flag_bias, + num, + ch_in, + h_in, + w_in, + h_out, + w_out, + act_param, + ctx); + } else { + conv_depthwise_3x3s2p0_bias_s_relu(dout, + din, + weights, + bias, + flag_bias, + flag_relu, + num, + ch_in, + h_in, + w_in, + h_out, + w_out, + ctx); + } } } if (pad == 1) { if (w_in > 7) { - conv_depthwise_3x3s2p1_bias(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - act_param, - ctx); - } else { - conv_depthwise_3x3s2p1_bias_s(dout, + if (relu6) { + conv_depthwise_3x3s2p1_bias(dout, din, weights, bias, @@ -151,6 +180,51 @@ void conv_depthwise_3x3s2_fp32(const float* din, w_out, act_param, ctx); + } else { + conv_depthwise_3x3s2p1_bias_relu(dout, + din, + weights, + bias, + flag_bias, + flag_relu, + num, + ch_in, + h_in, + w_in, + h_out, + w_out, + ctx); + } + } else { + if (relu6) { + conv_depthwise_3x3s2p1_bias_s(dout, + din, + weights, + bias, + flag_bias, + num, + ch_in, + h_in, + w_in, + h_out, + w_out, + act_param, + ctx); + } else { + conv_depthwise_3x3s2p1_bias_s_relu(dout, + din, + weights, + bias, + flag_bias, + flag_relu, + num, + ch_in, + h_in, + w_in, + h_out, + w_out, + ctx); + } } } } @@ -476,7 +550,7 @@ void conv_depthwise_3x3s2_fp32(const float* din, \ "st1 {v16.4s}, [%[outptr0]], #16 \n" \ "fcmge v11.4s, v17.4s, %[vzero].4s \n" /* vcgeq_u32 */ \ - "fmul v12.4s, v16.4s, v22.4s \n" \ + "fmul v12.4s, v17.4s, v22.4s \n" \ \ "ld1 {v20.4s}, [%[inptr3]] \n" \ "ld1 {v21.4s}, [%[inptr4]] \n" \ @@ -552,6 +626,7 @@ void 
conv_depthwise_3x3s2_fp32(const float* din, "ld1 {v20.4s}, [%[inptr3]] \n" \ "ld1 {v21.4s}, [%[inptr4]] \n" \ \ + "fadd v17.4s, v17.4s, v14.4s \n" \ "bif v16.16b, v12.16b, v11.16b \n" /* choose*/ \ "ext v10.16b, v0.16b, v15.16b, #4 \n" \ "fcmge v11.4s, v17.4s, %[vzero].4s \n" /* vcgeq_u32 */ \ @@ -977,207 +1052,158 @@ void act_switch_3x3s2p1(const float* din0_ptr, int cnt, int cnt_remain, const operators::ActivationParam act_param) { - bool has_active = act_param.has_active; - if (has_active) { - float tmp = act_param.Relu_clipped_coef; - float ss = act_param.Leaky_relu_alpha; - float vsix[4] = {tmp, tmp, tmp, tmp}; - float vscale[4] = {ss, ss, ss, ss}; - - switch (act_param.active_type) { - case lite_api::ActivationType::kRelu: - asm volatile( - INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2_RELU MID_COMPUTE_S2 - MID_RESULT_S2_RELU RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_RELU - : [inptr0] "+r"(din0_ptr), - [inptr1] "+r"(din1_ptr), - [inptr2] "+r"(din2_ptr), - [inptr3] "+r"(din3_ptr), - [inptr4] "+r"(din4_ptr), - [outptr0] "+r"(doutr0_ptr), - [outptr1] "+r"(doutr1_ptr), - [cnt] "+r"(cnt) - : [vzero] "w"(vzero), - [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [remain] "r"(cnt_remain), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [wmask] "w"(wmask), - [vbias] "w"(wbias) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21"); - break; - case lite_api::ActivationType::kRelu6: - /* 0 <= din <= 6 */ - asm volatile( - INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2_RELU6 MID_COMPUTE_S2 - MID_RESULT_S2_RELU6 RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_RELU6 - : [inptr0] "+r"(din0_ptr), - [inptr1] "+r"(din1_ptr), - [inptr2] "+r"(din2_ptr), - [inptr3] "+r"(din3_ptr), - [inptr4] "+r"(din4_ptr), - [outptr0] "+r"(doutr0_ptr), - [outptr1] "+r"(doutr1_ptr), - [cnt] "+r"(cnt) - : [vzero] "w"(vzero), - [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), 
- [remain] "r"(cnt_remain), - [six_ptr] "r"(vsix), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [wmask] "w"(wmask), - [vbias] "w"(wbias) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22"); - break; - case lite_api::ActivationType::kLeakyRelu: - /*din = din >= 0 ? din : din * scale*/ - asm volatile(INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2_LEAKY_RELU - MID_COMPUTE_S2 MID_RESULT_S2_LEAKY_RELU - RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_LEAKY_RELU - : [inptr0] "+r"(din0_ptr), - [inptr1] "+r"(din1_ptr), - [inptr2] "+r"(din2_ptr), - [inptr3] "+r"(din3_ptr), - [inptr4] "+r"(din4_ptr), - [outptr0] "+r"(doutr0_ptr), - [outptr1] "+r"(doutr1_ptr), - [cnt] "+r"(cnt) - : [vzero] "w"(vzero), - [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [remain] "r"(cnt_remain), - [scale_ptr] "r"(vscale), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [wmask] "w"(wmask), - [vbias] "w"(wbias) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22"); - break; - default: - LOG(FATAL) << "this act_type: " - << static_cast(act_param.active_type) - << " fuse not support"; - } - } else { - asm volatile(INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2 MID_COMPUTE_S2 - MID_RESULT_S2 RIGHT_COMPUTE_S2 RIGHT_RESULT_S2 - : [inptr0] "+r"(din0_ptr), - [inptr1] "+r"(din1_ptr), - [inptr2] "+r"(din2_ptr), - [inptr3] "+r"(din3_ptr), - [inptr4] "+r"(din4_ptr), - [outptr0] "+r"(doutr0_ptr), - [outptr1] "+r"(doutr1_ptr), - [cnt] "+r"(cnt) - : [vzero] "w"(vzero), - [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [remain] "r"(cnt_remain), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [wmask] "w"(wmask), - [vbias] "w"(wbias) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - 
"v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21"); + float tmp = act_param.Relu_clipped_coef; + float ss = act_param.Leaky_relu_alpha; + float vsix[4] = {tmp, tmp, tmp, tmp}; + float vscale[4] = {ss, ss, ss, ss}; + + switch (act_param.active_type) { + case lite_api::ActivationType::kRelu: + asm volatile(INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2_RELU MID_COMPUTE_S2 + MID_RESULT_S2_RELU RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_RELU + : [inptr0] "+r"(din0_ptr), + [inptr1] "+r"(din1_ptr), + [inptr2] "+r"(din2_ptr), + [inptr3] "+r"(din3_ptr), + [inptr4] "+r"(din4_ptr), + [outptr0] "+r"(doutr0_ptr), + [outptr1] "+r"(doutr1_ptr), + [cnt] "+r"(cnt) + : [vzero] "w"(vzero), + [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [remain] "r"(cnt_remain), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [wmask] "w"(wmask), + [vbias] "w"(wbias) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21"); + break; + case lite_api::ActivationType::kRelu6: + /* 0 <= din <= 6 */ + asm volatile( + INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2_RELU6 MID_COMPUTE_S2 + MID_RESULT_S2_RELU6 RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_RELU6 + : [inptr0] "+r"(din0_ptr), + [inptr1] "+r"(din1_ptr), + [inptr2] "+r"(din2_ptr), + [inptr3] "+r"(din3_ptr), + [inptr4] "+r"(din4_ptr), + [outptr0] "+r"(doutr0_ptr), + [outptr1] "+r"(doutr1_ptr), + [cnt] "+r"(cnt) + : [vzero] "w"(vzero), + [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [remain] "r"(cnt_remain), + [six_ptr] "r"(vsix), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [wmask] "w"(wmask), + [vbias] "w"(wbias) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", 
+ "v21", + "v22"); + break; + case lite_api::ActivationType::kLeakyRelu: + /*din = din >= 0 ? din : din * scale*/ + asm volatile(INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2_LEAKY_RELU + MID_COMPUTE_S2 MID_RESULT_S2_LEAKY_RELU RIGHT_COMPUTE_S2 + RIGHT_RESULT_S2_LEAKY_RELU + : [inptr0] "+r"(din0_ptr), + [inptr1] "+r"(din1_ptr), + [inptr2] "+r"(din2_ptr), + [inptr3] "+r"(din3_ptr), + [inptr4] "+r"(din4_ptr), + [outptr0] "+r"(doutr0_ptr), + [outptr1] "+r"(doutr1_ptr), + [cnt] "+r"(cnt) + : [vzero] "w"(vzero), + [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [remain] "r"(cnt_remain), + [scale_ptr] "r"(vscale), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [wmask] "w"(wmask), + [vbias] "w"(wbias) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22"); + break; + default: + LOG(FATAL) << "this act_type: " << static_cast(act_param.active_type) + << " fuse not support"; } } #endif @@ -1569,249 +1595,191 @@ void act_switch_3x3s2p0(const float* din0_ptr, int cnt, int cnt_remain, const operators::ActivationParam act_param) { - bool has_active = act_param.has_active; - if (has_active) { - float tmp = act_param.Relu_clipped_coef; - float ss = act_param.Leaky_relu_alpha; - float vsix[4] = {tmp, tmp, tmp, tmp}; - float vscale[4] = {ss, ss, ss, ss}; - - switch (act_param.active_type) { - case lite_api::ActivationType::kRelu: - asm volatile( - INIT_S2 - "ld1 {v15.4s}, [%[inptr0]] \n" - "ld1 {v18.4s}, [%[inptr1]] \n" - "ld1 {v19.4s}, [%[inptr2]] \n" - "ld1 {v20.4s}, [%[inptr3]] \n" - "ld1 {v21.4s}, [%[inptr4]] \n" - "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} - MID_COMPUTE_S2 MID_RESULT_S2_RELU - "cmp %w[remain], #1 \n" - "blt 4f \n" RIGHT_COMPUTE_S2 - RIGHT_RESULT_S2_RELU - "4: \n" - : [inptr0] "+r"(din0_ptr), - [inptr1] "+r"(din1_ptr), - [inptr2] "+r"(din2_ptr), - [inptr3] "+r"(din3_ptr), - [inptr4] 
"+r"(din4_ptr), - [outptr0] "+r"(doutr0_ptr), - [outptr1] "+r"(doutr1_ptr), - [cnt] "+r"(cnt) - : [vzero] "w"(vzero), - [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [remain] "r"(cnt_remain), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [wmask] "w"(wmask), - [vbias] "w"(wbias) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21"); - break; - case lite_api::ActivationType::kRelu6: - /* 0 <= din <= 6 */ - asm volatile( - INIT_S2 - "ld1 {v15.4s}, [%[inptr0]] \n" - "ld1 {v18.4s}, [%[inptr1]] \n" - "ld1 {v19.4s}, [%[inptr2]] \n" - "ld1 {v20.4s}, [%[inptr3]] \n" - "ld1 {v21.4s}, [%[inptr4]] \n" - "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} - "ld1 {v22.4s}, [%[six_ptr]] \n" MID_COMPUTE_S2 - MID_RESULT_S2_RELU6 - "cmp %w[remain], #1 \n" - "blt 4f \n" RIGHT_COMPUTE_S2 - RIGHT_RESULT_S2_RELU6 - "4: \n" - : [inptr0] "+r"(din0_ptr), - [inptr1] "+r"(din1_ptr), - [inptr2] "+r"(din2_ptr), - [inptr3] "+r"(din3_ptr), - [inptr4] "+r"(din4_ptr), - [outptr0] "+r"(doutr0_ptr), - [outptr1] "+r"(doutr1_ptr), - [cnt] "+r"(cnt) - : [vzero] "w"(vzero), - [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [remain] "r"(cnt_remain), - [six_ptr] "r"(vsix), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [wmask] "w"(wmask), - [vbias] "w"(wbias) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22"); - break; - case lite_api::ActivationType::kLeakyRelu: - /*din = din >= 0 ? 
din : din * scale*/ - asm volatile( - INIT_S2 - "ld1 {v15.4s}, [%[inptr0]] \n" - "ld1 {v18.4s}, [%[inptr1]] \n" - "ld1 {v19.4s}, [%[inptr2]] \n" - "ld1 {v20.4s}, [%[inptr3]] \n" - "ld1 {v21.4s}, [%[inptr4]] \n" - "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} - "ld1 {v22.4s}, [%[scale_ptr]] \n" MID_COMPUTE_S2 - MID_RESULT_S2_LEAKY_RELU - "cmp %w[remain], #1 \n" - "blt 4f \n" RIGHT_COMPUTE_S2 - RIGHT_RESULT_S2_LEAKY_RELU - "4: \n" - : [inptr0] "+r"(din0_ptr), - [inptr1] "+r"(din1_ptr), - [inptr2] "+r"(din2_ptr), - [inptr3] "+r"(din3_ptr), - [inptr4] "+r"(din4_ptr), - [outptr0] "+r"(doutr0_ptr), - [outptr1] "+r"(doutr1_ptr), - [cnt] "+r"(cnt) - : [vzero] "w"(vzero), - [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [remain] "r"(cnt_remain), - [scale_ptr] "r"(vscale), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [wmask] "w"(wmask), - [vbias] "w"(wbias) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22"); - break; - default: - LOG(FATAL) << "this act_type: " - << static_cast(act_param.active_type) - << " fuse not support"; - } - } else { - asm volatile( - INIT_S2 - "ld1 {v15.4s}, [%[inptr0]] \n" - "ld1 {v18.4s}, [%[inptr1]] \n" - "ld1 {v19.4s}, [%[inptr2]] \n" - "ld1 {v20.4s}, [%[inptr3]] \n" - "ld1 {v21.4s}, [%[inptr4]] \n" - "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} - MID_COMPUTE_S2 MID_RESULT_S2 - "cmp %w[remain], #1 \n" - "blt 4f \n" RIGHT_COMPUTE_S2 - RIGHT_RESULT_S2 "4: \n" - : [inptr0] "+r"(din0_ptr), - [inptr1] "+r"(din1_ptr), - [inptr2] "+r"(din2_ptr), - [inptr3] "+r"(din3_ptr), - [inptr4] "+r"(din4_ptr), - [outptr0] "+r"(doutr0_ptr), - [outptr1] "+r"(doutr1_ptr), - [cnt] "+r"(cnt) - : [vzero] "w"(vzero), - [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [remain] "r"(cnt_remain), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [wmask] "w"(wmask), - 
[vbias] "w"(wbias) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21"); + float tmp = act_param.Relu_clipped_coef; + float ss = act_param.Leaky_relu_alpha; + float vsix[4] = {tmp, tmp, tmp, tmp}; + float vscale[4] = {ss, ss, ss, ss}; + + switch (act_param.active_type) { + case lite_api::ActivationType::kRelu: + asm volatile( + INIT_S2 + "ld1 {v15.4s}, [%[inptr0]] \n" + "ld1 {v18.4s}, [%[inptr1]] \n" + "ld1 {v19.4s}, [%[inptr2]] \n" + "ld1 {v20.4s}, [%[inptr3]] \n" + "ld1 {v21.4s}, [%[inptr4]] \n" + "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} + MID_COMPUTE_S2 MID_RESULT_S2_RELU + "cmp %w[remain], #1 \n" + "blt 4f \n" RIGHT_COMPUTE_S2 + RIGHT_RESULT_S2_RELU + "4: \n" + : [inptr0] "+r"(din0_ptr), + [inptr1] "+r"(din1_ptr), + [inptr2] "+r"(din2_ptr), + [inptr3] "+r"(din3_ptr), + [inptr4] "+r"(din4_ptr), + [outptr0] "+r"(doutr0_ptr), + [outptr1] "+r"(doutr1_ptr), + [cnt] "+r"(cnt) + : [vzero] "w"(vzero), + [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [remain] "r"(cnt_remain), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [wmask] "w"(wmask), + [vbias] "w"(wbias) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21"); + break; + case lite_api::ActivationType::kRelu6: + /* 0 <= din <= 6 */ + asm volatile( + INIT_S2 + "ld1 {v15.4s}, [%[inptr0]] \n" + "ld1 {v18.4s}, [%[inptr1]] \n" + "ld1 {v19.4s}, [%[inptr2]] \n" + "ld1 {v20.4s}, [%[inptr3]] \n" + "ld1 {v21.4s}, [%[inptr4]] \n" + "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} + "ld1 {v22.4s}, [%[six_ptr]] \n" MID_COMPUTE_S2 + MID_RESULT_S2_RELU6 + "cmp %w[remain], #1 \n" + "blt 4f \n" RIGHT_COMPUTE_S2 + RIGHT_RESULT_S2_RELU6 + "4: \n" + : [inptr0] "+r"(din0_ptr), + [inptr1] 
"+r"(din1_ptr), + [inptr2] "+r"(din2_ptr), + [inptr3] "+r"(din3_ptr), + [inptr4] "+r"(din4_ptr), + [outptr0] "+r"(doutr0_ptr), + [outptr1] "+r"(doutr1_ptr), + [cnt] "+r"(cnt) + : [vzero] "w"(vzero), + [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [remain] "r"(cnt_remain), + [six_ptr] "r"(vsix), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [wmask] "w"(wmask), + [vbias] "w"(wbias) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22"); + break; + case lite_api::ActivationType::kLeakyRelu: + /*din = din >= 0 ? din : din * scale*/ + asm volatile( + INIT_S2 + "ld1 {v15.4s}, [%[inptr0]] \n" + "ld1 {v18.4s}, [%[inptr1]] \n" + "ld1 {v19.4s}, [%[inptr2]] \n" + "ld1 {v20.4s}, [%[inptr3]] \n" + "ld1 {v21.4s}, [%[inptr4]] \n" + "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} + "ld1 {v22.4s}, [%[scale_ptr]] \n" MID_COMPUTE_S2 + MID_RESULT_S2_LEAKY_RELU + "cmp %w[remain], #1 \n" + "blt 4f \n" RIGHT_COMPUTE_S2 + RIGHT_RESULT_S2_LEAKY_RELU + "4: \n" + : [inptr0] "+r"(din0_ptr), + [inptr1] "+r"(din1_ptr), + [inptr2] "+r"(din2_ptr), + [inptr3] "+r"(din3_ptr), + [inptr4] "+r"(din4_ptr), + [outptr0] "+r"(doutr0_ptr), + [outptr1] "+r"(doutr1_ptr), + [cnt] "+r"(cnt) + : [vzero] "w"(vzero), + [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [remain] "r"(cnt_remain), + [scale_ptr] "r"(vscale), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [wmask] "w"(wmask), + [vbias] "w"(wbias) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22"); + break; + default: + LOG(FATAL) << "this act_type: " << static_cast(act_param.active_type) + << " fuse not support"; } } #endif diff --git a/lite/backends/arm/math/conv3x3s2p01_depthwise_fp32_relu.cc 
b/lite/backends/arm/math/conv3x3s2p01_depthwise_fp32_relu.cc new file mode 100644 index 0000000000000000000000000000000000000000..b2f0243279fd1be27349bfeb97a3a61eed3eff4d --- /dev/null +++ b/lite/backends/arm/math/conv3x3s2p01_depthwise_fp32_relu.cc @@ -0,0 +1,1735 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "lite/backends/arm/math/conv_depthwise.h" + +namespace paddle { +namespace lite { +namespace arm { +namespace math { + +#ifdef __aarch64__ +#define INIT_S2 \ + "prfm pldl1keep, [%[inptr0]] \n" \ + "prfm pldl1keep, [%[inptr1]] \n" \ + "prfm pldl1keep, [%[inptr2]] \n" \ + "prfm pldl1keep, [%[inptr3]] \n" \ + "prfm pldl1keep, [%[inptr4]] \n" \ + "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" \ + "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" \ + "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" \ + "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" \ + "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" \ + \ + "and v16.16b, %[vbias].16b, %[vbias].16b \n" \ + "and v17.16b, %[vbias].16b, %[vbias].16b \n" + +#define LEFT_COMPUTE_S2 \ + "ext v10.16b, %[vzero].16b, v1.16b, #12 \n" /* r0 */ \ + "fmul v11.4s, v0.4s, %[w0].s[1] \n" /* {0,2,4,6} * w01 */ \ + "fmul v12.4s, v1.4s, %[w0].s[2] \n" /* {1,3,5,7} * w02 */ \ + "fmla v16.4s, v10.4s, %[w0].s[0] \n" /* {0,1,3,5} * w00*/ \ + \ + "ext v10.16b, %[vzero].16b, v3.16b, #12 \n" /* v10 = {0,1,3,5} */ \ + \ + "sub %[inptr0], %[inptr0], #4 \n" \ + "sub 
%[inptr1], %[inptr1], #4 \n" /* r1 */ \ + "fmla v11.4s, v2.4s, %[w1].s[1] \n" \ + "fmla v12.4s, v3.4s, %[w1].s[2] \n" \ + "fmla v16.4s, v10.4s, %[w1].s[0] \n" \ + \ + "ext v10.16b, %[vzero].16b, v5.16b, #12 \n" \ + \ + "sub %[inptr2], %[inptr2], #4 \n" \ + "sub %[inptr3], %[inptr3], #4 \n" /* r2 */ \ + "fmul v13.4s, v4.4s, %[w0].s[1] \n" \ + "fmla v11.4s, v4.4s, %[w2].s[1] \n" \ + \ + "fmul v14.4s, v5.4s, %[w0].s[2] \n" \ + "fmla v12.4s, v5.4s, %[w2].s[2] \n" \ + \ + "fmla v17.4s, v10.4s, %[w0].s[0] \n" \ + "fmla v16.4s, v10.4s, %[w2].s[0] \n" \ + \ + "ext v10.16b, %[vzero].16b, v7.16b, #12 \n" \ + \ + "sub %[inptr4], %[inptr4], #4 \n" /* r3 */ \ + "fmla v13.4s, v6.4s, %[w1].s[1] \n" \ + "fmla v14.4s, v7.4s, %[w1].s[2] \n" \ + "fmla v17.4s, v10.4s, %[w1].s[0] \n" \ + \ + "ext v10.16b, %[vzero].16b, v9.16b, #12 \n" \ + "fadd v16.4s, v16.4s, v11.4s \n" \ + "fadd v16.4s, v16.4s, v12.4s \n" + +#define LEFT_RESULT_S2 \ + /* r4 */ \ + "fmla v13.4s, v8.4s, %[w2].s[1] \n" \ + "fmla v14.4s, v9.4s, %[w2].s[2] \n" \ + "fmla v17.4s, v10.4s, %[w2].s[0] \n" \ + \ + "st1 {v16.4s}, [%[outptr0]], #16 \n" \ + \ + "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" \ + "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" \ + "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" \ + \ + "fadd v17.4s, v17.4s, v13.4s \n" \ + \ + "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" \ + "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" \ + "ld1 {v15.4s}, [%[inptr0]] \n" \ + "and v16.16b, %[vbias].16b, %[vbias].16b \n" \ + \ + "fadd v17.4s, v17.4s, v14.4s \n" \ + \ + "ld1 {v18.4s}, [%[inptr1]] \n" \ + "ld1 {v19.4s}, [%[inptr2]] \n" \ + \ + "ext v10.16b, v0.16b, v15.16b, #4 \n" \ + \ + "ld1 {v20.4s}, [%[inptr3]] \n" \ + "ld1 {v21.4s}, [%[inptr4]] \n" \ + \ + "st1 {v17.4s}, [%[outptr1]], #16 \n" \ + \ + "cmp %w[cnt], #1 \n" \ + \ + "and v17.16b, %[vbias].16b, %[vbias].16b \n" \ + \ + "blt 1f \n" + +#define MID_COMPUTE_S2 \ + "2: \n" /* r0 */ \ + "fmul v11.4s, v0.4s, %[w0].s[0] \n" \ + "fmul v12.4s, v1.4s, %[w0].s[1] \n" \ + "fmla v16.4s, 
v10.4s, %[w0].s[2] \n" \ + \ + "ext v10.16b, v2.16b, v18.16b, #4 \n" \ + "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" /* r1 */ \ + "fmla v11.4s, v2.4s, %[w1].s[0] \n" \ + "fmla v12.4s, v3.4s, %[w1].s[1] \n" \ + "fmla v16.4s, v10.4s, %[w1].s[2] \n" \ + \ + "ext v10.16b, v4.16b, v19.16b, #4 \n" \ + \ + "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" /* r2 */ \ + "fmul v13.4s, v4.4s, %[w0].s[0] \n" \ + "fmla v11.4s, v4.4s, %[w2].s[0] \n" \ + \ + "fmul v14.4s, v5.4s, %[w0].s[1] \n" \ + "fmla v12.4s, v5.4s, %[w2].s[1] \n" \ + \ + "fmla v17.4s, v10.4s, %[w0].s[2] \n" \ + "fmla v16.4s, v10.4s, %[w2].s[2] \n" \ + \ + "ext v10.16b, v6.16b, v20.16b, #4 \n" \ + \ + "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" /* r3 */ \ + "fmla v13.4s, v6.4s, %[w1].s[0] \n" \ + "fmla v14.4s, v7.4s, %[w1].s[1] \n" \ + "fmla v17.4s, v10.4s, %[w1].s[2] \n" \ + \ + "ext v10.16b, v8.16b, v21.16b, #4 \n" \ + \ + "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" \ + \ + "fadd v16.4s, v16.4s, v11.4s \n" \ + "fadd v16.4s, v16.4s, v12.4s \n" + +#define MID_RESULT_S2 \ + /* r4 */ \ + "fmla v13.4s, v8.4s, %[w2].s[0] \n" \ + "fmla v14.4s, v9.4s, %[w2].s[1] \n" \ + "fmla v17.4s, v10.4s, %[w2].s[2] \n" \ + \ + "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" \ + "ld1 {v15.4s}, [%[inptr0]] \n" \ + "ld1 {v18.4s}, [%[inptr1]] \n" \ + "st1 {v16.4s}, [%[outptr0]], #16 \n" \ + \ + "fadd v17.4s, v17.4s, v13.4s \n" \ + \ + "ld1 {v19.4s}, [%[inptr2]] \n" \ + "ld1 {v20.4s}, [%[inptr3]] \n" \ + "ld1 {v21.4s}, [%[inptr4]] \n" \ + \ + "fadd v17.4s, v17.4s, v14.4s \n" \ + \ + "ext v10.16b, v0.16b, v15.16b, #4 \n" \ + "and v16.16b, %[vbias].16b, %[vbias].16b \n" \ + "subs %w[cnt], %w[cnt], #1 \n" \ + \ + "st1 {v17.4s}, [%[outptr1]], #16 \n" \ + \ + "and v17.16b, %[vbias].16b, %[vbias].16b \n" \ + \ + "bne 2b \n" + +#define RIGHT_COMPUTE_S2 \ + "1: \n" \ + "cmp %w[remain], #1 \n" \ + "blt 4f \n" \ + "3: \n" \ + "bif v0.16b, %[vzero].16b, %[mask1].16b \n" \ + "bif v1.16b, %[vzero].16b, %[mask2].16b \n" \ + \ + "bif v2.16b, %[vzero].16b, %[mask1].16b 
\n" \ + "bif v3.16b, %[vzero].16b, %[mask2].16b \n" \ + \ + "bif v4.16b, %[vzero].16b, %[mask1].16b \n" \ + "bif v5.16b, %[vzero].16b, %[mask2].16b \n" \ + \ + "ext v10.16b, v0.16b, %[vzero].16b, #4 \n" \ + \ + "bif v6.16b, %[vzero].16b, %[mask1].16b \n" \ + "bif v7.16b, %[vzero].16b, %[mask2].16b \n" /* r0 */ \ + "fmul v11.4s, v0.4s, %[w0].s[0] \n" \ + "fmul v12.4s, v1.4s, %[w0].s[1] \n" \ + "fmla v16.4s, v10.4s, %[w0].s[2] \n" \ + \ + "ext v10.16b, v2.16b, %[vzero].16b, #4 \n" \ + "bif v8.16b, %[vzero].16b, %[mask1].16b \n" \ + "bif v9.16b, %[vzero].16b, %[mask2].16b \n" /* r1 */ \ + "fmla v11.4s, v2.4s, %[w1].s[0] \n" \ + "fmla v12.4s, v3.4s, %[w1].s[1] \n" \ + "fmla v16.4s, v10.4s, %[w1].s[2] \n" \ + \ + "ext v10.16b, v4.16b, %[vzero].16b, #4 \n" /* r2 */ \ + "fmul v13.4s, v4.4s, %[w0].s[0] \n" \ + "fmla v11.4s, v4.4s, %[w2].s[0] \n" \ + \ + "fmul v14.4s, v5.4s, %[w0].s[1] \n" \ + "fmla v12.4s, v5.4s, %[w2].s[1] \n" \ + \ + "fmla v17.4s, v10.4s, %[w0].s[2] \n" \ + "fmla v16.4s, v10.4s, %[w2].s[2] \n" \ + \ + "ext v10.16b, v6.16b, %[vzero].16b, #4 \n" /* r3 */ \ + "fmla v13.4s, v6.4s, %[w1].s[0] \n" \ + "fmla v14.4s, v7.4s, %[w1].s[1] \n" \ + "fmla v17.4s, v10.4s, %[w1].s[2] \n" \ + \ + "ext v10.16b, v8.16b, %[vzero].16b, #4 \n" \ + "ld1 {v0.4s}, [%[outptr0]] \n" \ + \ + "fadd v16.4s, v16.4s, v11.4s \n" \ + "fadd v16.4s, v16.4s, v12.4s \n" \ + "ld1 {v1.4s}, [%[outptr1]] \n" + +#define RIGHT_RESULT_S2 \ + /* r4 */ \ + "fmla v13.4s, v8.4s, %[w2].s[0] \n" \ + "fmla v14.4s, v9.4s, %[w2].s[1] \n" \ + "fmla v17.4s, v10.4s, %[w2].s[2] \n" \ + \ + "bif v16.16b, v0.16b, %[wmask].16b \n" \ + \ + "fadd v17.4s, v17.4s, v13.4s \n" \ + \ + "st1 {v16.4s}, [%[outptr0]], #16 \n" \ + \ + "fadd v17.4s, v17.4s, v14.4s \n" \ + \ + "bif v17.16b, v1.16b, %[wmask].16b \n" \ + \ + "st1 {v17.4s}, [%[outptr1]], #16 \n" \ + "4: \n" + +#define LEFT_RESULT_S2_RELU \ + /* r4 */ \ + "fmla v13.4s, v8.4s, %[w2].s[1] \n" \ + "fmla v14.4s, v9.4s, %[w2].s[2] \n" \ + "fmla v17.4s, v10.4s, %[w2].s[0] 
\n" \ + \ + "fmax v16.4s, v16.4s, %[vzero].4s \n" \ + \ + "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" \ + "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" \ + "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" \ + \ + "fadd v17.4s, v17.4s, v13.4s \n" \ + \ + "st1 {v16.4s}, [%[outptr0]], #16 \n" \ + \ + "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" \ + "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" \ + "ld1 {v15.4s}, [%[inptr0]] \n" \ + \ + "fadd v17.4s, v17.4s, v14.4s \n" \ + \ + "and v16.16b, %[vbias].16b, %[vbias].16b \n" \ + \ + "ld1 {v18.4s}, [%[inptr1]] \n" \ + "ld1 {v19.4s}, [%[inptr2]] \n" \ + \ + "ext v10.16b, v0.16b, v15.16b, #4 \n" \ + \ + "fmax v17.4s, v17.4s, %[vzero].4s \n" \ + \ + "ld1 {v20.4s}, [%[inptr3]] \n" \ + "ld1 {v21.4s}, [%[inptr4]] \n" \ + \ + "st1 {v17.4s}, [%[outptr1]], #16 \n" \ + \ + "cmp %w[cnt], #1 \n" \ + \ + "and v17.16b, %[vbias].16b, %[vbias].16b \n" \ + \ + "blt 1f \n" + +#define MID_RESULT_S2_RELU \ + /* r4 */ \ + "fmla v13.4s, v8.4s, %[w2].s[0] \n" \ + "fmla v14.4s, v9.4s, %[w2].s[1] \n" \ + "fmla v17.4s, v10.4s, %[w2].s[2] \n" \ + \ + "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" \ + "ld1 {v15.4s}, [%[inptr0]] \n" \ + "ld1 {v18.4s}, [%[inptr1]] \n" \ + "fmax v16.4s, v16.4s, %[vzero].4s \n" /* relu */ \ + \ + "fadd v17.4s, v17.4s, v13.4s \n" \ + \ + "ld1 {v19.4s}, [%[inptr2]] \n" \ + "ld1 {v20.4s}, [%[inptr3]] \n" \ + "ld1 {v21.4s}, [%[inptr4]] \n" \ + \ + "st1 {v16.4s}, [%[outptr0]], #16 \n" \ + \ + "fadd v17.4s, v17.4s, v14.4s \n" \ + \ + "ext v10.16b, v0.16b, v15.16b, #4 \n" \ + "and v16.16b, %[vbias].16b, %[vbias].16b \n" \ + "subs %w[cnt], %w[cnt], #1 \n" \ + \ + "fmax v17.4s, v17.4s, %[vzero].4s \n" /* relu */ \ + \ + "st1 {v17.4s}, [%[outptr1]], #16 \n" \ + \ + "and v17.16b, %[vbias].16b, %[vbias].16b \n" \ + \ + "bne 2b \n" + +#define RIGHT_RESULT_S2_RELU \ + /* r4 */ \ + "fmla v13.4s, v8.4s, %[w2].s[0] \n" \ + "fmla v14.4s, v9.4s, %[w2].s[1] \n" \ + "fmla v17.4s, v10.4s, %[w2].s[2] \n" \ + \ + "fmax v16.4s, v16.4s, %[vzero].4s \n" /* relu */ \ + \ + 
"fadd v17.4s, v17.4s, v13.4s \n" \ + \ + "bif v16.16b, v0.16b, %[wmask].16b \n" \ + \ + "fadd v17.4s, v17.4s, v14.4s \n" \ + \ + "st1 {v16.4s}, [%[outptr0]], #16 \n" \ + \ + "fmax v17.4s, v17.4s, %[vzero].4s \n" /* relu */ \ + \ + "bif v17.16b, v1.16b, %[wmask].16b \n" \ + \ + "st1 {v17.4s}, [%[outptr1]], #16 \n" \ + "4: \n" + +#define COMPUTE_S_S2 \ + "movi v9.4s, #0 \n" \ + "ld1 {v6.4s, v7.4s}, [%[mask_ptr]], #32 \n" \ + \ + "ld2 {v10.4s, v11.4s}, [%[din0_ptr]], #32 \n" \ + "ld2 {v12.4s, v13.4s}, [%[din1_ptr]], #32 \n" \ + "ld2 {v14.4s, v15.4s}, [%[din2_ptr]], #32 \n" \ + \ + "bif v10.16b, v9.16b, v6.16b \n" \ + "bif v11.16b, v9.16b, v7.16b \n" \ + "bif v12.16b, v9.16b, v6.16b \n" \ + "bif v13.16b, v9.16b, v7.16b \n" \ + "bif v14.16b, v9.16b, v6.16b \n" \ + "bif v15.16b, v9.16b, v7.16b \n" \ + \ + "ext v6.16b, v9.16b, v11.16b, #12 \n" \ + "ext v7.16b, v9.16b, v13.16b, #12 \n" \ + "ext v8.16b, v9.16b, v15.16b, #12 \n" \ + \ + "fmul v4.4s, v10.4s, %[wr0].s[1] \n" \ + "fmul v5.4s, v11.4s, %[wr0].s[2] \n" \ + "fmul v6.4s, v6.4s, %[wr0].s[0] \n" \ + \ + "fmla v4.4s, v12.4s, %[wr1].s[1] \n" \ + "fmla v5.4s, v13.4s, %[wr1].s[2] \n" \ + "fmla v6.4s, v7.4s, %[wr1].s[0] \n" \ + \ + "fmla v4.4s, v14.4s, %[wr2].s[1] \n" \ + "fmla v5.4s, v15.4s, %[wr2].s[2] \n" \ + "fmla v6.4s, v8.4s, %[wr2].s[0] \n" \ + \ + "fadd v4.4s, v4.4s, v5.4s \n" \ + "fadd v4.4s, v4.4s, v6.4s \n" + +#define RESULT_S_S2 \ + "fadd v4.4s, v4.4s, %[bias].4s \n" \ + \ + "st1 {v4.4s}, [%[out]] \n" + +#define RESULT_S_S2_RELU \ + "fadd v4.4s, v4.4s, %[bias].4s \n" \ + "fmax v4.4s, v4.4s, v9.4s \n" \ + \ + "st1 {v4.4s}, [%[out]] \n" + +#define COMPUTE_S_S2_P0 \ + "movi v9.4s, #0 \n" \ + "ld1 {v6.4s, v7.4s}, [%[mask_ptr]], #32 \n" \ + \ + "ld2 {v10.4s, v11.4s}, [%[din0_ptr]], #32 \n" \ + "ld2 {v12.4s, v13.4s}, [%[din1_ptr]], #32 \n" \ + "ld2 {v14.4s, v15.4s}, [%[din2_ptr]], #32 \n" \ + "and v4.16b, %[bias].16b, %[bias].16b \n" \ + \ + "bif v10.16b, v9.16b, v6.16b \n" \ + "bif v11.16b, v9.16b, v7.16b \n" \ + 
"bif v12.16b, v9.16b, v6.16b \n" \ + "bif v13.16b, v9.16b, v7.16b \n" \ + "bif v14.16b, v9.16b, v6.16b \n" \ + "bif v15.16b, v9.16b, v7.16b \n" \ + \ + "ext v6.16b, v10.16b, v9.16b, #4 \n" \ + "ext v7.16b, v12.16b, v9.16b, #4 \n" \ + "ext v8.16b, v14.16b, v9.16b, #4 \n" \ + \ + "fmla v4.4s, v10.4s, %[wr0].s[0] \n" \ + "fmul v5.4s, v11.4s, %[wr0].s[1] \n" \ + "fmul v16.4s, v6.4s, %[wr0].s[2] \n" \ + \ + "fmla v4.4s, v12.4s, %[wr1].s[0] \n" \ + "fmla v5.4s, v13.4s, %[wr1].s[1] \n" \ + "fmla v16.4s, v7.4s, %[wr1].s[2] \n" \ + \ + "fmla v4.4s, v14.4s, %[wr2].s[0] \n" \ + "fmla v5.4s, v15.4s, %[wr2].s[1] \n" \ + "fmla v16.4s, v8.4s, %[wr2].s[2] \n" \ + \ + "fadd v4.4s, v4.4s, v5.4s \n" \ + "fadd v4.4s, v4.4s, v16.4s \n" + +#define RESULT_S_S2_P0 "st1 {v4.4s}, [%[out]] \n" + +#define RESULT_S_S2_P0_RELU \ + "fmax v4.4s, v4.4s, v9.4s \n" \ + "st1 {v4.4s}, [%[out]] \n" + +#else +#define INIT_S2 \ + "vmov.u32 q9, #0 \n" \ + "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r1\n" \ + "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" \ + "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r1\n" \ + "pld [%[din0_ptr]] @ preload data\n" \ + "pld [%[din1_ptr]] @ preload data\n" \ + "pld [%[din2_ptr]] @ preload data\n" \ + \ + "vdup.32 q3, %[bias] @ and \n" + +#define LEFT_COMPUTE_S2 \ + "vext.32 q6, q9, q11, #3 @ shift right 1 data\n" \ + "vext.32 q7, q9, q13, #3 @ shift right 1 data\n" \ + "vext.32 q8, q9, q15, #3 @ shift right 1 data\n" \ + "vmul.f32 q4, q10, %e[wr0][1] @ mul weight 1, out0\n" \ + "vmul.f32 q5, q11, %f[wr0][0] @ mul weight 1, out0\n" \ + "vmla.f32 q3, q6, %e[wr0][0] @ mul weight 1, out0\n" \ + \ + "sub %[din0_ptr], #4 @ inpitr0 - 1\n" \ + "sub %[din1_ptr], #4 @ inpitr1 - 1\n" \ + "sub %[din2_ptr], #4 @ inpitr2 - 1\n" \ + \ + "vld2.32 {d20-d23}, [%[din0_ptr]]! 
@ load din r0\n" \ + \ + "vmla.f32 q4, q12, %e[wr1][1] @ mul weight 1, out0\n" \ + "vmla.f32 q5, q13, %f[wr1][0] @ mul weight 1, out0\n" \ + "vmla.f32 q3, q7, %e[wr1][0] @ mul weight 1, out0\n" \ + \ + "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" \ + \ + "vmla.f32 q4, q14, %e[wr2][1] @ mul weight 1, out1\n" \ + "vmla.f32 q5, q15, %f[wr2][0] @ mul weight 1, out1\n" \ + "vmla.f32 q3, q8, %e[wr2][0] @ mul weight 1, out1\n" \ + \ + "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r1\n" \ + \ + "vadd.f32 q3, q3, q4 @ add \n" \ + "vadd.f32 q3, q3, q5 @ add \n" + +#define LEFT_RESULT_S2 \ + "vst1.32 {d6-d7}, [%[outptr]]! \n" \ + "cmp %[cnt], #1 \n" \ + "blt 1f \n" + +#define MID_COMPUTE_S2 \ + "2: \n" \ + "vld1.32 {d16}, [%[din0_ptr]] @ load din r0\n" \ + "vdup.32 q3, %[bias] @ and \n" \ + "vext.32 q6, q10, q8, #1 @ shift left 1 \n" \ + "vld1.32 {d16}, [%[din1_ptr]] @ load din r1\n" \ + \ + "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, out0\n" \ + "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, out0\n" \ + "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, out0\n" \ + \ + "vext.32 q7, q12, q8, #1 @ shift left 1 \n" \ + "vld1.32 {d16}, [%[din2_ptr]] @ load din r1\n" \ + \ + "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" \ + \ + "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, out0\n" \ + "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, out0\n" \ + "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, out0\n" \ + \ + "vext.32 q6, q14, q8, #1 @ shift left 1 \n" \ + \ + "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" \ + \ + "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, out0\n" \ + "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, out0\n" \ + "vmla.f32 q3, q6, %f[wr2][0] @ mul weight 2, out0\n" \ + \ + "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r2\n" \ + \ + "vadd.f32 q3, q3, q4 @ add \n" \ + "vadd.f32 q3, q3, q5 @ add \n" + +#define MID_RESULT_S2 \ + "subs %[cnt], #1 \n" \ + \ + "vst1.32 {d6-d7}, [%[outptr]]! 
\n" \ + "bne 2b \n" + +#define RIGHT_COMPUTE_S2 \ + "1: \n" \ + "cmp %[remain], #1 \n" \ + "blt 3f \n" \ + \ + "vld1.f32 {d12-d15}, [%[mask_ptr]]! @ load mask\n" \ + "vdup.32 q3, %[bias] @ and \n" \ + \ + "vbif q10, q9, q6 @ bit select, deal with " \ + "right pad\n" \ + "vbif q11, q9, q7 @ bit select, deal with " \ + "right pad\n" \ + "vbif q12, q9, q6 @ bit select, deal with " \ + "right pad\n" \ + "vbif q13, q9, q7 @ bit select, deal with " \ + "right pad\n" \ + "vbif q14, q9, q6 @ bit select, deal with " \ + "right pad\n" \ + "vbif q15, q9, q7 @ bit select, deal with " \ + "right pad\n" \ + \ + "vext.32 q6, q10, q9, #1 @ shift left 1 \n" \ + "vext.32 q7, q12, q9, #1 @ shift left 1 \n" \ + \ + "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, out0\n" \ + "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, out0\n" \ + "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, out0\n" \ + \ + "vext.32 q6, q14, q9, #1 @ shift left 1 \n" \ + "vld1.f32 {d20-d21}, [%[outptr]] @ load output\n" \ + \ + "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, out0\n" \ + "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, out0\n" \ + "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, out0\n" \ + \ + "vld1.f32 {d22-d23}, [%[mask_ptr]] @ load mask\n" \ + \ + "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, out0\n" \ + "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, out0\n" \ + "vmla.f32 q3, q6, %f[wr2][0] @ mul weight 2, out0\n" \ + \ + "vadd.f32 q3, q3, q4 @ add \n" \ + "vadd.f32 q3, q3, q5 @ add \n" + +#define RIGHT_RESULT_S2 \ + "vbif.f32 q3, q10, q11 @ write mask\n" \ + \ + "vst1.32 {d6-d7}, [%[outptr]]! \n" \ + "3: \n" + +#define LEFT_RESULT_S2_RELU \ + "vmax.f32 q3, q3, q9 @ relu \n" \ + "vst1.32 {d6-d7}, [%[outptr]]! \n" \ + "cmp %[cnt], #1 \n" \ + "blt 1f \n" + +#define MID_RESULT_S2_RELU \ + "vmax.f32 q3, q3, q9 @ relu \n" \ + "subs %[cnt], #1 \n" \ + \ + "vst1.32 {d6-d7}, [%[outptr]]! 
\n" \ + "bne 2b \n" + +#define RIGHT_RESULT_S2_RELU \ + "vmax.f32 q3, q3, q9 @ relu \n" \ + "vbif.f32 q3, q10, q11 @ write mask\n" \ + \ + "vst1.32 {d6-d7}, [%[outptr]]! \n" \ + "3: \n" + +#define COMPUTE_S_S2 \ + "vmov.u32 q9, #0 \n" \ + "vld1.f32 {d12-d15}, [%[mask_ptr]]! @ load mask\n" \ + "vdup.32 q3, %[bias] @ and \n" \ + \ + "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" \ + "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" \ + "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r2\n" \ + \ + "vbif q10, q9, q6 @ bit select, deal with " \ + "right pad\n" \ + "vbif q11, q9, q7 @ bit select, deal with " \ + "right pad\n" \ + "vbif q12, q9, q6 @ bit select, deal with " \ + "right pad\n" \ + "vbif q13, q9, q7 @ bit select, deal with " \ + "right pad\n" \ + "vbif q14, q9, q6 @ bit select, deal with " \ + "right pad\n" \ + "vbif q15, q9, q7 @ bit select, deal with " \ + "right pad\n" \ + \ + "vext.32 q6, q9, q11, #3 @ shift left 1 \n" \ + "vext.32 q7, q9, q13, #3 @ shift left 1 \n" \ + "vext.32 q8, q9, q15, #3 @ shift left 1 \n" \ + \ + "vmul.f32 q4, q10, %e[wr0][1] @ mul weight 0, out0\n" \ + "vmul.f32 q5, q11, %f[wr0][0] @ mul weight 0, out0\n" \ + "vmla.f32 q3, q6, %e[wr0][0] @ mul weight 0, out0\n" \ + \ + "vmla.f32 q4, q12, %e[wr1][1] @ mul weight 1, out0\n" \ + "vmla.f32 q5, q13, %f[wr1][0] @ mul weight 1, out0\n" \ + "vmla.f32 q3, q7, %e[wr1][0] @ mul weight 1, out0\n" \ + \ + "vmla.f32 q4, q14, %e[wr2][1] @ mul weight 2, out0\n" \ + "vmla.f32 q5, q15, %f[wr2][0] @ mul weight 2, out0\n" \ + "vmla.f32 q3, q8, %e[wr2][0] @ mul weight 2, out0\n" \ + \ + "vadd.f32 q3, q3, q4 @ add \n" \ + "vadd.f32 q3, q3, q5 @ add \n" + +#define RESULT_S_S2 "vst1.32 {d6-d7}, [%[out]] \n" + +#define RESULT_S_S2_RELU \ + "vmax.f32 q3, q3, q9 @ relu\n" \ + \ + "vst1.32 {d6-d7}, [%[out]] \n" + +#define COMPUTE_S_S2_P0 \ + "vmov.u32 q9, #0 \n" \ + "vld1.f32 {d12-d15}, [%[mask_ptr]] @ load mask\n" \ + "vdup.32 q3, %[bias] @ and \n" \ + \ + "vld2.32 {d20-d23}, [%[din0_ptr]]! 
@ load din r0\n" \ + "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" \ + "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r2\n" \ + \ + "vbif q10, q9, q6 @ bit select, deal with " \ + "right pad\n" \ + "vbif q11, q9, q7 @ bit select, deal with " \ + "right pad\n" \ + "vbif q12, q9, q6 @ bit select, deal with " \ + "right pad\n" \ + "vbif q13, q9, q7 @ bit select, deal with " \ + "right pad\n" \ + "vbif q14, q9, q6 @ bit select, deal with " \ + "right pad\n" \ + "vbif q15, q9, q7 @ bit select, deal with " \ + "right pad\n" \ + \ + "vext.32 q6, q10, q9, #1 @ shift left 1 \n" \ + "vext.32 q7, q12, q9, #1 @ shift left 1 \n" \ + "vext.32 q8, q14, q9, #1 @ shift left 1 \n" \ + \ + "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, out0\n" \ + "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, out0\n" \ + "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, out0\n" \ + \ + "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, out0\n" \ + "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, out0\n" \ + "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, out0\n" \ + \ + "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, out0\n" \ + "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, out0\n" \ + "vmla.f32 q3, q8, %f[wr2][0] @ mul weight 2, out0\n" \ + \ + "vadd.f32 q3, q3, q4 @ add \n" \ + "vadd.f32 q3, q3, q5 @ add \n" + +#define RESULT_S_S2_P0 "vst1.32 {d6-d7}, [%[out]] \n" + +#define RESULT_S_S2_P0_RELU \ + "vmax.f32 q3, q3, q9 @ relu \n" \ + "vst1.32 {d6-d7}, [%[out]] \n" + +#endif + +/** + * \brief depthwise convolution kernel 3x3, stride 2 + * w_in > 7 + */ +void conv_depthwise_3x3s2p1_bias_relu(float* dout, + const float* din, + const float* weights, + const float* bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext* ctx) { + int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; + int out_pad_idx[4] = {0, 1, 2, 3}; + int size_pad_bottom = h_out * 2 - h_in; + + int cnt_col = (w_out >> 2) - 2; + int 
size_right_remain = w_in - (7 + cnt_col * 8); + if (size_right_remain >= 9) { + cnt_col++; + size_right_remain -= 8; + } + int cnt_remain = (size_right_remain == 8) ? 4 : (w_out % 4); // + + int size_right_pad = w_out * 2 - w_in; + + uint32x4_t vmask_rp1 = vcgtq_s32(vdupq_n_s32(size_right_remain), + vld1q_s32(right_pad_idx)); // 0 2 4 6 + uint32x4_t vmask_rp2 = vcgtq_s32(vdupq_n_s32(size_right_remain), + vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 + uint32x4_t wmask = + vcgtq_s32(vdupq_n_s32(cnt_remain), vld1q_s32(out_pad_idx)); // 0 1 2 3 + int size_in_channel = w_in * h_in; + int size_out_channel = w_out * h_out; + + float* zero_ptr = ctx->workspace_data(); + memset(zero_ptr, 0, w_in * sizeof(float)); + float* write_ptr = zero_ptr + w_in; + + unsigned int dmask[12]; + + vst1q_u32(dmask, vmask_rp1); + vst1q_u32(dmask + 4, vmask_rp2); + vst1q_u32(dmask + 8, wmask); + + for (int n = 0; n < num; ++n) { + const float* din_batch = din + n * ch_in * size_in_channel; + float* dout_batch = dout + n * ch_in * size_out_channel; +#pragma omp parallel for + for (int i = 0; i < ch_in; ++i) { + const float* din_channel = din_batch + i * size_in_channel; + float* dout_channel = dout_batch + i * size_out_channel; + + const float* weight_ptr = weights + i * 9; + float32x4_t wr0 = vld1q_f32(weight_ptr); + float32x4_t wr1 = vld1q_f32(weight_ptr + 3); + float32x4_t wr2 = vld1q_f32(weight_ptr + 6); + + float32x4_t vzero = vdupq_n_f32(0.f); +#ifdef __aarch64__ + float32x4_t wbias; + if (flag_bias) { + wbias = vdupq_n_f32(bias[i]); + } else { + wbias = vdupq_n_f32(0.f); + } +#else + float bias_c = 0.f; + if (flag_bias) { + bias_c = bias[i]; + } +#endif // __aarch64__ + + const float* dr0 = din_channel; + const float* dr1 = dr0 + w_in; + const float* dr2 = dr1 + w_in; + const float* dr3 = dr2 + w_in; + const float* dr4 = dr3 + w_in; + + const float* din0_ptr = dr0; + const float* din1_ptr = dr1; + const float* din2_ptr = dr2; + const float* din3_ptr = dr3; + const float* din4_ptr = dr4; + 
+ float* doutr0 = dout_channel; + float* doutr0_ptr = nullptr; + float* doutr1_ptr = nullptr; + +#ifdef __aarch64__ + for (int i = 0; i < h_in; i += 4) { + din0_ptr = dr0; + din1_ptr = dr1; + din2_ptr = dr2; + din3_ptr = dr3; + din4_ptr = dr4; + + doutr0_ptr = doutr0; + doutr1_ptr = doutr0 + w_out; + + if (i == 0) { + din0_ptr = zero_ptr; + din1_ptr = dr0; + din2_ptr = dr1; + din3_ptr = dr2; + din4_ptr = dr3; + dr0 = dr3; + dr1 = dr4; + } else { + dr0 = dr4; + dr1 = dr0 + w_in; + } + dr2 = dr1 + w_in; + dr3 = dr2 + w_in; + dr4 = dr3 + w_in; + + //! process bottom pad + if (i + 4 > h_in) { + switch (i + 4 - h_in) { + case 4: + din1_ptr = zero_ptr; + case 3: + din2_ptr = zero_ptr; + case 2: + din3_ptr = zero_ptr; + case 1: + din4_ptr = zero_ptr; + default: + break; + } + } + //! process output pad + if (i / 2 + 2 > h_out) { + doutr1_ptr = write_ptr; + } + int cnt = cnt_col; + if (flag_relu) { + asm volatile( + INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2_RELU MID_COMPUTE_S2 + MID_RESULT_S2_RELU RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_RELU + : [inptr0] "+r"(din0_ptr), + [inptr1] "+r"(din1_ptr), + [inptr2] "+r"(din2_ptr), + [inptr3] "+r"(din3_ptr), + [inptr4] "+r"(din4_ptr), + [outptr0] "+r"(doutr0_ptr), + [outptr1] "+r"(doutr1_ptr), + [cnt] "+r"(cnt) + : [vzero] "w"(vzero), + [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [remain] "r"(cnt_remain), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [wmask] "w"(wmask), + [vbias] "w"(wbias) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21"); + } else { + asm volatile(INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2 MID_COMPUTE_S2 + MID_RESULT_S2 RIGHT_COMPUTE_S2 RIGHT_RESULT_S2 + : [inptr0] "+r"(din0_ptr), + [inptr1] "+r"(din1_ptr), + [inptr2] "+r"(din2_ptr), + [inptr3] "+r"(din3_ptr), + [inptr4] "+r"(din4_ptr), + [outptr0] "+r"(doutr0_ptr), + [outptr1] "+r"(doutr1_ptr), + [cnt] 
"+r"(cnt) + : [vzero] "w"(vzero), + [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [remain] "r"(cnt_remain), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [wmask] "w"(wmask), + [vbias] "w"(wbias) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21"); + } + doutr0 = doutr0 + 2 * w_out; + } +#else + for (int i = 0; i < h_in; i += 2) { + din0_ptr = dr0; + din1_ptr = dr1; + din2_ptr = dr2; + + doutr0_ptr = doutr0; + + if (i == 0) { + din0_ptr = zero_ptr; + din1_ptr = dr0; + din2_ptr = dr1; + dr0 = dr1; + dr1 = dr2; + dr2 = dr1 + w_in; + } else { + dr0 = dr2; + dr1 = dr0 + w_in; + dr2 = dr1 + w_in; + } + + //! process bottom pad + if (i + 2 > h_in) { + switch (i + 2 - h_in) { + case 2: + din1_ptr = zero_ptr; + case 1: + din2_ptr = zero_ptr; + default: + break; + } + } + int cnt = cnt_col; + unsigned int* mask_ptr = dmask; + if (flag_relu) { + asm volatile( + INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2_RELU MID_COMPUTE_S2 + MID_RESULT_S2_RELU RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_RELU + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [outptr] "+r"(doutr0_ptr), + [cnt] "+r"(cnt), + [mask_ptr] "+r"(mask_ptr) + : [remain] "r"(cnt_remain), + [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "r"(bias_c) + : "cc", + "memory", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } else { + asm volatile(INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2 MID_COMPUTE_S2 + MID_RESULT_S2 RIGHT_COMPUTE_S2 RIGHT_RESULT_S2 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [outptr] "+r"(doutr0_ptr), + [cnt] "+r"(cnt), + [mask_ptr] "+r"(mask_ptr) + : [remain] "r"(cnt_remain), + [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "r"(bias_c) + : "cc", + "memory", + "q3", + "q4", + "q5", 
+ "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } + doutr0 = doutr0 + w_out; + } +#endif + } + } +} + +/** + * \brief depthwise convolution kernel 3x3, stride 2, width <= 4 + */ +void conv_depthwise_3x3s2p1_bias_s_relu(float* dout, + const float* din, + const float* weights, + const float* bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext* ctx) { + int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; + int out_pad_idx[4] = {0, 1, 2, 3}; + float zeros[8] = {0.0f}; + + uint32x4_t vmask_rp1 = + vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx)); // 0 2 4 6 + uint32x4_t vmask_rp2 = + vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 + + int size_in_channel = w_in * h_in; + int size_out_channel = w_out * h_out; + + unsigned int dmask[8]; + vst1q_u32(dmask, vmask_rp1); + vst1q_u32(dmask + 4, vmask_rp2); + + for (int n = 0; n < num; ++n) { + const float* din_batch = din + n * ch_in * size_in_channel; + float* dout_batch = dout + n * ch_in * size_out_channel; +#pragma omp parallel for + for (int i = 0; i < ch_in; ++i) { + const float* din_channel = din_batch + i * size_in_channel; + float* dout_channel = dout_batch + i * size_out_channel; + + const float* weight_ptr = weights + i * 9; + float32x4_t wr0 = vld1q_f32(weight_ptr); + float32x4_t wr1 = vld1q_f32(weight_ptr + 3); + float32x4_t wr2 = vld1q_f32(weight_ptr + 6); + + float bias_c = 0.f; + + if (flag_bias) { + bias_c = bias[i]; + } + float32x4_t vbias = vdupq_n_f32(bias_c); + int hs = -1; + int he = 2; + float out_buf[4]; + for (int j = 0; j < h_out; ++j) { + const float* dr0 = din_channel + hs * w_in; + const float* dr1 = dr0 + w_in; + const float* dr2 = dr1 + w_in; + if (hs == -1) { + dr0 = zeros; + } + if (he > h_in) { + dr2 = zeros; + } + const float* din0_ptr = dr0; + const float* din1_ptr = dr1; + const float* din2_ptr = dr2; + + unsigned 
int* mask_ptr = dmask; +#ifdef __aarch64__ + if (flag_relu) { + asm volatile(COMPUTE_S_S2 RESULT_S_S2_RELU + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [mask_ptr] "+r"(mask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "w"(vbias), + [out] "r"(out_buf) + : "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15"); + } else { + asm volatile(COMPUTE_S_S2 RESULT_S_S2 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [mask_ptr] "+r"(mask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "w"(vbias), + [out] "r"(out_buf) + : "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15"); + } +#else + if (flag_relu) { + asm volatile(COMPUTE_S_S2 RESULT_S_S2_RELU + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [mask_ptr] "+r"(mask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "r"(bias_c), + [out] "r"(out_buf) + : "cc", + "memory", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } else { + asm volatile(COMPUTE_S_S2 RESULT_S_S2 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [mask_ptr] "+r"(mask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "r"(bias_c), + [out] "r"(out_buf) + : "cc", + "memory", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } +#endif + for (int w = 0; w < w_out; ++w) { + *dout_channel++ = out_buf[w]; + } + hs += 2; + he += 2; + } + } + } +} + +/** + * \brief depthwise convolution kernel 3x3, stride 2 + */ +// w_in > 7 +void conv_depthwise_3x3s2p0_bias_relu(float* dout, + const float* din, + const float* weights, + const float* bias, + bool flag_bias, + bool flag_relu, + const int num, + const 
int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext* ctx) { + int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; + int out_pad_idx[4] = {0, 1, 2, 3}; + + int tile_w = w_out >> 2; + int cnt_remain = w_out % 4; + + unsigned int size_right_remain = (unsigned int)(8 + (tile_w << 3) - w_in); + size_right_remain = 8 - size_right_remain; + + if (cnt_remain == 0 && size_right_remain == 0) { + cnt_remain = 4; + tile_w -= 1; + size_right_remain = 8; + } + uint32x4_t vmask_rp1 = vcgtq_s32(vdupq_n_s32(size_right_remain), + vld1q_s32(right_pad_idx)); // 0 2 4 6 + uint32x4_t vmask_rp2 = vcgtq_s32(vdupq_n_s32(size_right_remain), + vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 + uint32x4_t wmask = + vcgtq_s32(vdupq_n_s32(cnt_remain), vld1q_s32(out_pad_idx)); // 0 1 2 3 + int size_in_channel = w_in * h_in; + int size_out_channel = w_out * h_out; + + float* zero_ptr = ctx->workspace_data(); + memset(zero_ptr, 0, w_in * sizeof(float)); + float* write_ptr = zero_ptr + w_in; + + unsigned int dmask[12]; + + vst1q_u32(dmask, vmask_rp1); + vst1q_u32(dmask + 4, vmask_rp2); + vst1q_u32(dmask + 8, wmask); + + for (int n = 0; n < num; ++n) { + const float* din_batch = din + n * ch_in * size_in_channel; + float* dout_batch = dout + n * ch_in * size_out_channel; +#pragma omp parallel for + for (int i = 0; i < ch_in; ++i) { + const float* din_channel = din_batch + i * size_in_channel; + float* dout_channel = dout_batch + i * size_out_channel; + + const float* weight_ptr = weights + i * 9; + float32x4_t wr0 = vld1q_f32(weight_ptr); + float32x4_t wr1 = vld1q_f32(weight_ptr + 3); + float32x4_t wr2 = vld1q_f32(weight_ptr + 6); + + float32x4_t vzero = vdupq_n_f32(0.f); + +#ifdef __aarch64__ + float32x4_t wbias; + if (flag_bias) { + wbias = vdupq_n_f32(bias[i]); + } else { + wbias = vdupq_n_f32(0.f); + } +#else + float bias_c = 0.f; + if (flag_bias) { + bias_c = bias[i]; + } +#endif // __aarch64__ + + const float* dr0 = din_channel; + const float* dr1 = dr0 + 
w_in; + const float* dr2 = dr1 + w_in; + const float* dr3 = dr2 + w_in; + const float* dr4 = dr3 + w_in; + + const float* din0_ptr = dr0; + const float* din1_ptr = dr1; + const float* din2_ptr = dr2; + const float* din3_ptr = dr3; + const float* din4_ptr = dr4; + + float* doutr0 = dout_channel; + float* doutr0_ptr = nullptr; + float* doutr1_ptr = nullptr; + +#ifdef __aarch64__ + for (int i = 0; i < h_out; i += 2) { + din0_ptr = dr0; + din1_ptr = dr1; + din2_ptr = dr2; + din3_ptr = dr3; + din4_ptr = dr4; + + doutr0_ptr = doutr0; + doutr1_ptr = doutr0 + w_out; + + dr0 = dr4; + dr1 = dr0 + w_in; + dr2 = dr1 + w_in; + dr3 = dr2 + w_in; + dr4 = dr3 + w_in; + + //! process bottom pad + if (i * 2 + 5 > h_in) { + switch (i * 2 + 5 - h_in) { + case 4: + din1_ptr = zero_ptr; + case 3: + din2_ptr = zero_ptr; + case 2: + din3_ptr = zero_ptr; + case 1: + din4_ptr = zero_ptr; + case 0: + din4_ptr = zero_ptr; + default: + break; + } + } + //! process output pad + if (i + 2 > h_out) { + doutr1_ptr = write_ptr; + } + int cnt = tile_w; + if (flag_relu) { + asm volatile( + INIT_S2 + "ld1 {v15.4s}, [%[inptr0]] \n" + "ld1 {v18.4s}, [%[inptr1]] \n" + "ld1 {v19.4s}, [%[inptr2]] \n" + "ld1 {v20.4s}, [%[inptr3]] \n" + "ld1 {v21.4s}, [%[inptr4]] \n" + "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} + MID_COMPUTE_S2 MID_RESULT_S2_RELU + "cmp %w[remain], #1 \n" + "blt 4f \n" RIGHT_COMPUTE_S2 + RIGHT_RESULT_S2_RELU + "4: \n" + : [inptr0] "+r"(din0_ptr), + [inptr1] "+r"(din1_ptr), + [inptr2] "+r"(din2_ptr), + [inptr3] "+r"(din3_ptr), + [inptr4] "+r"(din4_ptr), + [outptr0] "+r"(doutr0_ptr), + [outptr1] "+r"(doutr1_ptr), + [cnt] "+r"(cnt) + : [vzero] "w"(vzero), + [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [remain] "r"(cnt_remain), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [wmask] "w"(wmask), + [vbias] "w"(wbias) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + 
"v16", + "v17", + "v18", + "v19", + "v20", + "v21"); + } else { + asm volatile( + INIT_S2 + "ld1 {v15.4s}, [%[inptr0]] \n" + "ld1 {v18.4s}, [%[inptr1]] \n" + "ld1 {v19.4s}, [%[inptr2]] \n" + "ld1 {v20.4s}, [%[inptr3]] \n" + "ld1 {v21.4s}, [%[inptr4]] \n" + "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} + MID_COMPUTE_S2 MID_RESULT_S2 + "cmp %w[remain], #1 \n" + "blt 4f \n" RIGHT_COMPUTE_S2 + RIGHT_RESULT_S2 + "4: \n" + : [inptr0] "+r"(din0_ptr), + [inptr1] "+r"(din1_ptr), + [inptr2] "+r"(din2_ptr), + [inptr3] "+r"(din3_ptr), + [inptr4] "+r"(din4_ptr), + [outptr0] "+r"(doutr0_ptr), + [outptr1] "+r"(doutr1_ptr), + [cnt] "+r"(cnt) + : [vzero] "w"(vzero), + [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [remain] "r"(cnt_remain), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [wmask] "w"(wmask), + [vbias] "w"(wbias) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21"); + } + doutr0 = doutr0 + 2 * w_out; + } +#else + for (int i = 0; i < h_out; i++) { + din0_ptr = dr0; + din1_ptr = dr1; + din2_ptr = dr2; + + doutr0_ptr = doutr0; + + dr0 = dr2; + dr1 = dr0 + w_in; + dr2 = dr1 + w_in; + + //! 
process bottom pad + if (i * 2 + 3 > h_in) { + switch (i * 2 + 3 - h_in) { + case 2: + din1_ptr = zero_ptr; + case 1: + din2_ptr = zero_ptr; + default: + break; + } + } + int cnt = tile_w; + unsigned int* mask_ptr = dmask; + if (flag_relu) { + asm volatile(INIT_S2 MID_COMPUTE_S2 MID_RESULT_S2_RELU + RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_RELU + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [outptr] "+r"(doutr0_ptr), + [cnt] "+r"(cnt), + [mask_ptr] "+r"(mask_ptr) + : [remain] "r"(cnt_remain), + [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "r"(bias_c) + : "cc", + "memory", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } else { + asm volatile(INIT_S2 MID_COMPUTE_S2 MID_RESULT_S2 RIGHT_COMPUTE_S2 + RIGHT_RESULT_S2 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [outptr] "+r"(doutr0_ptr), + [cnt] "+r"(cnt), + [mask_ptr] "+r"(mask_ptr) + : [remain] "r"(cnt_remain), + [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "r"(bias_c) + : "cc", + "memory", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } + doutr0 = doutr0 + w_out; + } +#endif + } + } +} + +/** + * \brief depthwise convolution kernel 3x3, stride 2, width <= 4 + */ +void conv_depthwise_3x3s2p0_bias_s_relu(float* dout, + const float* din, + const float* weights, + const float* bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext* ctx) { + int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; + int out_pad_idx[4] = {0, 1, 2, 3}; + float zeros[8] = {0.0f}; + const float zero_ptr[4] = {0.f, 0.f, 0.f, 0.f}; + + uint32x4_t vmask_rp1 = + vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx)); // 0 2 4 6 + uint32x4_t vmask_rp2 = + vcgtq_s32(vdupq_n_s32(w_in), 
vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 + + int size_in_channel = w_in * h_in; + int size_out_channel = w_out * h_out; + + unsigned int dmask[8]; + vst1q_u32(dmask, vmask_rp1); + vst1q_u32(dmask + 4, vmask_rp2); + + for (int n = 0; n < num; ++n) { + const float* din_batch = din + n * ch_in * size_in_channel; + float* dout_batch = dout + n * ch_in * size_out_channel; +#pragma omp parallel for + for (int i = 0; i < ch_in; ++i) { + const float* din_channel = din_batch + i * size_in_channel; + float* dout_channel = dout_batch + i * size_out_channel; + + const float* weight_ptr = weights + i * 9; + float32x4_t wr0 = vld1q_f32(weight_ptr); + float32x4_t wr1 = vld1q_f32(weight_ptr + 3); + float32x4_t wr2 = vld1q_f32(weight_ptr + 6); + + float bias_c = 0.f; + + if (flag_bias) { + bias_c = bias[i]; + } + float32x4_t vbias = vdupq_n_f32(bias_c); + float out_buf[4]; + const float* dr0 = din_channel; + const float* dr1 = dr0 + w_in; + const float* dr2 = dr1 + w_in; + for (int j = 0; j < h_out; j++) { + const float* din0_ptr = dr0; + const float* din1_ptr = dr1; + const float* din2_ptr = dr2; + if (j * 2 + 2 >= h_in) { + switch (j + 2 - h_in) { + case 1: + din1_ptr = zero_ptr; + case 0: + din2_ptr = zero_ptr; + default: + break; + } + } + dr0 = dr2; + dr1 = dr0 + w_in; + dr2 = dr1 + w_in; + + unsigned int* mask_ptr = dmask; +#ifdef __aarch64__ + if (flag_relu) { + asm volatile(COMPUTE_S_S2_P0 RESULT_S_S2_P0_RELU + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [mask_ptr] "+r"(mask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "w"(vbias), + [out] "r"(out_buf) + : "cc", + "memory", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16"); + } else { + asm volatile(COMPUTE_S_S2_P0 RESULT_S_S2_P0 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [mask_ptr] "+r"(mask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] 
"w"(wr2), + [bias] "w"(vbias), + [out] "r"(out_buf) + : "cc", + "memory", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16"); + } +#else + if (flag_relu) { + asm volatile(COMPUTE_S_S2_P0 RESULT_S_S2_P0_RELU + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "r"(bias_c), + [out] "r"(out_buf), + [mask_ptr] "r"(dmask) + : "cc", + "memory", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } else { + asm volatile(COMPUTE_S_S2_P0 RESULT_S_S2_P0 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "r"(bias_c), + [out] "r"(out_buf), + [mask_ptr] "r"(dmask) + : "cc", + "memory", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } +#endif + for (int w = 0; w < w_out; ++w) { + *dout_channel++ = out_buf[w]; + } + } + } + } +} +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/lite/backends/arm/math/conv3x3s2px_depthwise_fp32.cc b/lite/backends/arm/math/conv3x3s2px_depthwise_fp32.cc index 4617d40f4372f6589f20b50205fb307cdc705808..4bb8554202b8feeea48b07e2057ea5d20606ab8e 100644 --- a/lite/backends/arm/math/conv3x3s2px_depthwise_fp32.cc +++ b/lite/backends/arm/math/conv3x3s2px_depthwise_fp32.cc @@ -113,9 +113,9 @@ namespace math { "fcmge v7.4s, v22.4s, v0.4s \n" /* vcgeq_u32 */ \ "fmul v8.4s, v22.4s, %[vscale].4s \n" /* mul */ \ "bif v19.16b, v2.16b, v1.16b \n" /* choose*/ \ - "bif v19.16b, v4.16b, v3.16b \n" /* choose*/ \ - "bif v19.16b, v6.16b, v5.16b \n" /* choose*/ \ - "bif v19.16b, v8.16b, v7.16b \n" /* choose*/ + "bif v20.16b, v4.16b, v3.16b \n" /* choose*/ \ + "bif v21.16b, v6.16b, v5.16b \n" /* choose*/ \ + "bif v22.16b, v8.16b, v7.16b \n" /* 
choose*/ #define STORE /* save result */ \ "str q19, [%[outc0]], #16\n" \ "str q20, [%[outc1]], #16\n" \ diff --git a/lite/backends/arm/math/conv5x5s2_depthwise_int8.cc b/lite/backends/arm/math/conv5x5s2_depthwise_int8.cc index c778896550de73f888979c8337731a0b9967b5dd..0ac1705de76102c92c9e63d64721aa2467baaf04 100644 --- a/lite/backends/arm/math/conv5x5s2_depthwise_int8.cc +++ b/lite/backends/arm/math/conv5x5s2_depthwise_int8.cc @@ -102,7 +102,7 @@ void conv_depthwise_5x5s2_int8(Dtype* dout, if (h + hout_r_block > hout) { h_kernel = hout - h; } - int hs = h - padh; + int hs = h * 2 - padh; int he = hs + h_kernel * 2 + 3; #pragma omp parallel for num_threads(threads) diff --git a/lite/backends/arm/math/conv_block_utils.h b/lite/backends/arm/math/conv_block_utils.h index 85404d6a6e2e6246677857be8231e15afa86210d..c4fb51021e5b0288a4bc1fd476764348fdc7e450 100644 --- a/lite/backends/arm/math/conv_block_utils.h +++ b/lite/backends/arm/math/conv_block_utils.h @@ -703,7 +703,9 @@ inline void act_switch_c1_fp32(const float* din_ptr, [cnt] "+r"(cnt_loop), [ptr_din] "+r"(din_ptr) : - : "v0", + : "cc", + "memory", + "v0", "v1", "v2", "v3", @@ -722,7 +724,7 @@ inline void act_switch_c1_fp32(const float* din_ptr, [ptr_din] "+r"(din_ptr), [cnt] "+r"(cnt_loop) : - : "q0", "q1", "q2", "q3", "q15"); + : "cc", "memory", "q0", "q1", "q2", "q3", "q15"); #endif break; case lite_api::ActivationType::kRelu6: @@ -734,7 +736,9 @@ inline void act_switch_c1_fp32(const float* din_ptr, [cnt] "+r"(cnt_loop), [ptr_din] "+r"(din_ptr) : [six] "w"(six) - : "v0", + : "cc", + "memory", + "v0", "v1", "v2", "v3", @@ -753,7 +757,7 @@ inline void act_switch_c1_fp32(const float* din_ptr, [ptr_din] "+r"(din_ptr), [cnt] "+r"(cnt_loop) : [six] "w"(six) - : "q0", "q1", "q2", "q3", "q15"); + : "cc", "memory", "q0", "q1", "q2", "q3", "q15"); #endif break; case lite_api::ActivationType::kLeakyRelu: @@ -765,7 +769,9 @@ inline void act_switch_c1_fp32(const float* din_ptr, [cnt] "+r"(cnt_loop), [ptr_din] "+r"(din_ptr) 
: [scale] "w"(scale) - : "v0", + : "cc", + "memory", + "v0", "v1", "v2", "v3", @@ -785,7 +791,9 @@ inline void act_switch_c1_fp32(const float* din_ptr, [ptr_din] "+r"(din_ptr), [cnt] "+r"(cnt_loop) : [scale] "w"(scale) - : "q0", + : "cc", + "memory", + "q0", "q1", "q2", "q3", @@ -812,14 +820,14 @@ inline void act_switch_c1_fp32(const float* din_ptr, [cnt] "+r"(cnt_loop), [ptr_din] "+r"(din_ptr) : - : "v0", "v1", "v2", "v3", "v20"); + : "cc", "memory", "v0", "v1", "v2", "v3", "v20"); #else asm volatile(NCHWC1_TRANS_FP32_COMPUTE NCHWC1_TRANS_FP32_STORE : [doutc0r0] "+r"(doutc0_ptr), [ptr_din] "+r"(din_ptr), [cnt] "+r"(cnt_loop) : - : "q0", "q1", "q2", "q3", "q15"); + : "cc", "memory", "q0", "q1", "q2", "q3", "q15"); #endif } } @@ -1006,7 +1014,9 @@ inline void act_switch_c2_fp32(const float* din_ptr, [cnt] "+r"(cnt_loop), [ptr_din] "+r"(din_ptr) : - : "v0", + : "cc", + "memory", + "v0", "v1", "v2", "v3", @@ -1026,7 +1036,7 @@ inline void act_switch_c2_fp32(const float* din_ptr, [ptr_din] "+r"(din_ptr), [cnt] "+r"(cnt_loop) : - : "q0", "q1", "q2", "q3", "q15"); + : "cc", "memory", "q0", "q1", "q2", "q3", "q15"); #endif break; case lite_api::ActivationType::kRelu6: @@ -1039,7 +1049,9 @@ inline void act_switch_c2_fp32(const float* din_ptr, [cnt] "+r"(cnt_loop), [ptr_din] "+r"(din_ptr) : [six] "w"(six) - : "v0", + : "cc", + "memory", + "v0", "v1", "v2", "v3", @@ -1059,7 +1071,7 @@ inline void act_switch_c2_fp32(const float* din_ptr, [ptr_din] "+r"(din_ptr), [cnt] "+r"(cnt_loop) : [six] "w"(six) - : "q0", "q1", "q2", "q3", "q15"); + : "cc", "memory", "q0", "q1", "q2", "q3", "q15"); #endif break; case lite_api::ActivationType::kLeakyRelu: @@ -1072,7 +1084,9 @@ inline void act_switch_c2_fp32(const float* din_ptr, [cnt] "+r"(cnt_loop), [ptr_din] "+r"(din_ptr) : [scale] "w"(scale) - : "v0", + : "cc", + "memory", + "v0", "v1", "v2", "v3", @@ -1092,7 +1106,9 @@ inline void act_switch_c2_fp32(const float* din_ptr, [ptr_din] "+r"(din_ptr), [cnt] "+r"(cnt_loop) : [scale] 
"w"(scale) - : "q0", + : "cc", + "memory", + "q0", "q1", "q2", "q3", @@ -1120,7 +1136,7 @@ inline void act_switch_c2_fp32(const float* din_ptr, [cnt] "+r"(cnt_loop), [ptr_din] "+r"(din_ptr) : - : "v0", "v1", "v2", "v3", "v4", "v5", "v20"); + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v20"); #else asm volatile(NCHWC2_TRANS_FP32_COMPUTE NCHWC2_TRANS_FP32_STORE : [doutc0r0] "+r"(doutc0_ptr), @@ -1128,7 +1144,7 @@ inline void act_switch_c2_fp32(const float* din_ptr, [ptr_din] "+r"(din_ptr), [cnt] "+r"(cnt_loop) : - : "q0", "q1", "q2", "q3", "q15"); + : "cc", "memory", "q0", "q1", "q2", "q3", "q15"); #endif } } @@ -1373,7 +1389,9 @@ inline void act_switch_c4_fp32(const float* din_ptr, [cnt] "+r"(cnt_loop), [ptr_din] "+r"(din_ptr) : - : "v0", + : "cc", + "memory", + "v0", "v1", "v2", "v3", @@ -1403,7 +1421,7 @@ inline void act_switch_c4_fp32(const float* din_ptr, [ptr_din] "+r"(din_ptr), [cnt] "+r"(cnt_loop) : - : "q0", "q1", "q2", "q3", "q15"); + : "cc", "memory", "q0", "q1", "q2", "q3", "q15"); #endif break; case lite_api::ActivationType::kRelu6: @@ -1418,7 +1436,9 @@ inline void act_switch_c4_fp32(const float* din_ptr, [cnt] "+r"(cnt_loop), [ptr_din] "+r"(din_ptr) : [six] "w"(six) - : "v0", + : "cc", + "memory", + "v0", "v1", "v2", "v3", @@ -1448,7 +1468,7 @@ inline void act_switch_c4_fp32(const float* din_ptr, [ptr_din] "+r"(din_ptr), [cnt] "+r"(cnt_loop) : [six] "w"(six) - : "q0", "q1", "q2", "q3", "q15"); + : "cc", "memory", "q0", "q1", "q2", "q3", "q15"); #endif break; case lite_api::ActivationType::kLeakyRelu: @@ -1463,7 +1483,9 @@ inline void act_switch_c4_fp32(const float* din_ptr, [cnt] "+r"(cnt_loop), [ptr_din] "+r"(din_ptr) : [scale] "w"(scale) - : "v0", + : "cc", + "memory", + "v0", "v1", "v2", "v3", @@ -1493,7 +1515,9 @@ inline void act_switch_c4_fp32(const float* din_ptr, [ptr_din] "+r"(din_ptr), [cnt] "+r"(cnt_loop) : [scale] "w"(scale) - : "q0", + : "cc", + "memory", + "q0", "q1", "q2", "q3", @@ -1523,7 +1547,9 @@ inline void 
act_switch_c4_fp32(const float* din_ptr, [cnt] "+r"(cnt_loop), [ptr_din] "+r"(din_ptr) : - : "v0", + : "cc", + "memory", + "v0", "v1", "v2", "v3", @@ -1544,7 +1570,7 @@ inline void act_switch_c4_fp32(const float* din_ptr, [ptr_din] "+r"(din_ptr), [cnt] "+r"(cnt_loop) : - : "q0", "q1", "q2", "q3", "q15"); + : "cc", "memory", "q0", "q1", "q2", "q3", "q15"); #endif } } @@ -1929,7 +1955,9 @@ inline void act_switch_c8_fp32(const float* din_ptr, [cnt] "+r"(cnt_loop), [ptr_din] "+r"(din_ptr) : - : "v0", + : "cc", + "memory", + "v0", "v1", "v2", "v3", @@ -1963,7 +1991,17 @@ inline void act_switch_c8_fp32(const float* din_ptr, [ptr_din] "+r"(din_ptr), [cnt] "+r"(cnt_loop) : - : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q15"); + : "cc", + "memory", + "q0", + "q1", + "q2", + "q3", + "q4", + "q5", + "q6", + "q7", + "q15"); #endif break; case lite_api::ActivationType::kRelu6: @@ -1982,7 +2020,9 @@ inline void act_switch_c8_fp32(const float* din_ptr, [cnt] "+r"(cnt_loop), [ptr_din] "+r"(din_ptr) : [six] "w"(six) - : "v0", + : "cc", + "memory", + "v0", "v1", "v2", "v3", @@ -2012,7 +2052,17 @@ inline void act_switch_c8_fp32(const float* din_ptr, [ptr_din] "+r"(din_ptr), [cnt] "+r"(cnt_loop) : [six] "w"(six) - : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q15"); + : "cc", + "memory", + "q0", + "q1", + "q2", + "q3", + "q4", + "q5", + "q6", + "q7", + "q15"); #endif break; case lite_api::ActivationType::kLeakyRelu: @@ -2031,7 +2081,9 @@ inline void act_switch_c8_fp32(const float* din_ptr, [cnt] "+r"(cnt_loop), [ptr_din] "+r"(din_ptr) : [scale] "w"(scale) - : "v0", + : "cc", + "memory", + "v0", "v1", "v2", "v3", @@ -2076,7 +2128,9 @@ inline void act_switch_c8_fp32(const float* din_ptr, [ptr_din] "+r"(din_ptr), [cnt] "+r"(cnt_loop) : [scale] "w"(scale) - : "q0", + : "cc", + "memory", + "q0", "q1", "q2", "q3", @@ -2112,7 +2166,9 @@ inline void act_switch_c8_fp32(const float* din_ptr, [cnt] "+r"(cnt_loop), [ptr_din] "+r"(din_ptr) : - : "v0", + : "cc", + "memory", + "v0", "v1", 
"v2", "v3", @@ -2146,7 +2202,17 @@ inline void act_switch_c8_fp32(const float* din_ptr, [ptr_din] "+r"(din_ptr), [cnt] "+r"(cnt_loop) : - : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q15"); + : "cc", + "memory", + "q0", + "q1", + "q2", + "q3", + "q4", + "q5", + "q6", + "q7", + "q15"); #endif } } @@ -2744,8 +2810,18 @@ inline void int32_nchwc4_kernel(int8_t*& dout0, // NOLINT float32x4_t bias, bool is_relu) { #ifdef __aarch64__ + float32x4_t vmax = vdupq_n_f32(-127.f); asm volatile(NCHWC4_TRANS_INT32 "subs %w[cnt], %w[cnt], #1\n" + /* data >= -127 */ + "fcmge v4.4s, v16.4s, %[vmax].4s \n" + "fcmge v5.4s, v18.4s, %[vmax].4s \n" + "fcmge v6.4s, v17.4s, %[vmax].4s \n" + "fcmge v7.4s, v19.4s, %[vmax].4s \n" + "bif v16.16b, %[vmax].16b, v4.16b \n" + "bif v18.16b, %[vmax].16b, v5.16b \n" + "bif v17.16b, %[vmax].16b, v6.16b \n" + "bif v19.16b, %[vmax].16b, v7.16b \n" /* fp32-int32 */ "fcvtas v4.4s, v16.4s\n" "fcvtas v5.4s, v18.4s\n" @@ -2773,7 +2849,10 @@ inline void int32_nchwc4_kernel(int8_t*& dout0, // NOLINT [doutc3r0] "+r"(dout3), [ptr_din] "+r"(din), [cnt] "+r"(cnt) - : [scale] "w"(scale), [bias] "w"(bias), [relu] "r"(is_relu) + : [scale] "w"(scale), + [vmax] "w"(vmax), + [bias] "w"(bias), + [relu] "r"(is_relu) : "cc", "memory", "v0", @@ -2799,6 +2878,7 @@ inline void int32_nchwc4_kernel(int8_t*& dout0, // NOLINT "v20", "v31"); #else + float vmax[4] = {-127.f, -127.f, -127.f, -127.f}; asm volatile(NCHWC4_TRANS_INT32 /* set 0.5 offset */ "vmov.f32 q2, #0.5\n" @@ -2815,11 +2895,21 @@ inline void int32_nchwc4_kernel(int8_t*& dout0, // NOLINT "vbif.f32 q3, q14, q7 @ get right offset\n" "vbif.f32 q4, q14, q8 @ get right offset\n" "vbif.f32 q5, q14, q9 @ get right offset\n" + "vld1.32 {d28-d29}, [%[vmax]] \n" /* add offset */ "vadd.f32 q10, q2, q10\n" "vadd.f32 q11, q3, q11\n" "vadd.f32 q12, q4, q12\n" "vadd.f32 q13, q5, q13\n" + /* data >= -127 */ + "vcge.f32 q6, q10, q14 @ q10 >= vmax \n" + "vcge.f32 q7, q11, q14 @ q11 >= vmax \n" + "vcge.f32 q8, q12, q14 @ q12 >= 
vmax \n" + "vcge.f32 q9, q13, q14 @ q13 >= vmax \n" + "vbif q10, q14, q6 @ choose \n" + "vbif q11, q14, q7 @ choose \n" + "vbif q12, q14, q8 @ choose \n" + "vbif q13, q14, q9 @ choose \n" /* fp32 to int32 */ "vcvt.s32.f32 q6, q10 @ cvt to int32\n" "vcvt.s32.f32 q7, q11 @ cvt to int32\n" @@ -2836,7 +2926,7 @@ inline void int32_nchwc4_kernel(int8_t*& dout0, // NOLINT "vqmovn.s16 d14, q12 @ cnt to int8\n" "vqmovn.s16 d15, q13 @ cnt to int8\n" "subs %[cnt], %[cnt], #1\n" - /* store */ + /* store data*/ "vld1.32 {d4-d7}, [%[ptr_din]]!\n" "vst1.32 {d12[0]}, [%[doutc0r0]]!\n" "vst1.32 {d13[0]}, [%[doutc1r0]]!\n" @@ -2850,7 +2940,10 @@ inline void int32_nchwc4_kernel(int8_t*& dout0, // NOLINT [doutc3r0] "+r"(dout3), [ptr_din] "+r"(din), [cnt] "+r"(cnt) - : [scale] "w"(scale), [bias] "w"(bias), [relu] "r"(is_relu) + : [scale] "w"(scale), + [bias] "w"(bias), + [relu] "r"(is_relu), + [vmax] "r"(vmax) : "cc", "memory", "q2", @@ -2989,8 +3082,10 @@ template <> inline int8_t cvt_kernel(int din, float scale, float bias, bool flag_relu) { if (flag_relu) { return saturate_cast(round(LITEMAX(din * scale + bias, 0))); + } else { + auto tmp = saturate_cast(round(din * scale + bias)); + return tmp < -127 ? 
-127 : tmp; } - return saturate_cast(round(din * scale + bias)); } template <> @@ -3362,7 +3457,27 @@ inline void int32_nchwc8_kernel(int8_t*& dout0, // NOLINT float32x4_t bias1, bool is_relu) { #ifdef __aarch64__ + float32x4_t vmax = vdupq_n_f32(-127.f); asm volatile(INT32_NCHWC8_TO_NCHW_FP32 /* fp32-int32 */ + /* data >= -127 */ + "fcmge v10.4s, v16.4s, %[vmax].4s \n" + "fcmge v11.4s, v17.4s, %[vmax].4s \n" + "fcmge v14.4s, v18.4s, %[vmax].4s \n" + "fcmge v15.4s, v19.4s, %[vmax].4s \n" + "fcmge v20.4s, v8.4s, %[vmax].4s \n" + "fcmge v21.4s, v9.4s, %[vmax].4s \n" + "fcmge v22.4s, v12.4s, %[vmax].4s \n" + "fcmge v23.4s, v13.4s, %[vmax].4s \n" + /* choose data */ + "bif v16.16b, %[vmax].16b, v10.16b \n" + "bif v17.16b, %[vmax].16b, v11.16b \n" + "bif v18.16b, %[vmax].16b, v14.16b \n" + "bif v19.16b, %[vmax].16b, v15.16b \n" + "bif v8.16b, %[vmax].16b, v20.16b \n" + "bif v9.16b, %[vmax].16b, v21.16b \n" + "bif v12.16b, %[vmax].16b, v22.16b \n" + "bif v13.16b, %[vmax].16b, v23.16b \n" + /* fp32 - int32 */ "fcvtas v10.4s, v16.4s\n" "fcvtas v11.4s, v17.4s\n" "fcvtas v14.4s, v18.4s\n" @@ -3413,6 +3528,7 @@ inline void int32_nchwc8_kernel(int8_t*& dout0, // NOLINT [scale1] "w"(scale1), [bias0] "w"(bias0), [bias1] "w"(bias1), + [vmax] "w"(vmax), [relu] "r"(is_relu) : "cc", "memory", @@ -3442,6 +3558,7 @@ inline void int32_nchwc8_kernel(int8_t*& dout0, // NOLINT "v23", "v31"); #else + float vmax[4] = {-127.f, -127.f, -127.f, -127.f}; asm volatile(INT32_NCHWC8_TO_NCHW_FP32 /* set +-0.5 offset */ "vmov.f32 q10, #-0.5\n" "vmov.f32 q9, #0.5\n" @@ -3475,7 +3592,18 @@ inline void int32_nchwc8_kernel(int8_t*& dout0, // NOLINT "vmov.f32 q9, #0.5\n" "vcgt.f32 q11, q7, q8 @ get mask > 0, in0\n" "vbif.f32 q9, q10, q11 @ get right offset\n" + "vld1.32 {d22-d23}, [%[vmax]] \n" "vadd.f32 q7, q7, q9\n" + /* data >= -127 */ + "vcge.f32 q8, q0, q11 @ q10 >= vmax \n" + "vcge.f32 q9, q2, q11 @ q10 >= vmax \n" + "vcge.f32 q10, q4, q11 @ q10 >= vmax \n" + /* choose data */ + "vbif q0, q11, q8 @ 
choose \n" + "vcge.f32 q8, q6, q11 @ q10 >= vmax \n" + "vbif q2, q11, q9 @ choose \n" + "vbif q4, q11, q10 @ choose \n" + "vbif q6, q11, q8 @ choose \n" /* fp32 to int32 */ "vcvt.s32.f32 q8, q0 @ cvt to int32\n" "vcvt.s32.f32 q9, q2 @ cvt to int32\n" @@ -3486,6 +3614,17 @@ inline void int32_nchwc8_kernel(int8_t*& dout0, // NOLINT "vqmovn.s32 d4, q9 @ cnt to int16\n" "vqmovn.s32 d8, q10 @ cnt to int16\n" "vqmovn.s32 d12, q11 @ cnt to int16\n" + /* data >= -127 */ + "vld1.32 {d22-d23}, [%[vmax]] \n" + "vcge.f32 q8, q1, q11 @ q10 >= vmax \n" + "vcge.f32 q9, q3, q11 @ q10 >= vmax \n" + "vcge.f32 q10, q5, q11 @ q10 >= vmax \n" + /* choose data */ + "vbif q1, q11, q8 @ choose \n" + "vcge.f32 q8, q7, q11 @ q10 >= vmax \n" + "vbif q3, q11, q9 @ choose \n" + "vbif q5, q11, q10 @ choose \n" + "vbif q7, q11, q8 @ choose \n" /* fp32 to int32 */ "vcvt.s32.f32 q8, q1 @ cvt to int32\n" "vcvt.s32.f32 q9, q3 @ cvt to int32\n" @@ -3529,6 +3668,7 @@ inline void int32_nchwc8_kernel(int8_t*& dout0, // NOLINT [scale1] "w"(scale1), [bias0] "w"(bias0), [bias1] "w"(bias1), + [vmax] "r"(vmax), [relu] "r"(is_relu) : "cc", "memory", diff --git a/lite/backends/arm/math/conv_depthwise.h b/lite/backends/arm/math/conv_depthwise.h index 4c5f284a19f615382ea04904184427f569f95ff3..72d887ce4e630057286d98c86970def4a9efdb04 100644 --- a/lite/backends/arm/math/conv_depthwise.h +++ b/lite/backends/arm/math/conv_depthwise.h @@ -207,6 +207,118 @@ void conv_depthwise_5x5s2_int8(Dtype* dout, int padh, ARMContext* ctx); +void conv_depthwise_3x3s1p0_bias_relu(float* dout, + const float* din, + const float* weights, + const float* bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext* ctx); + +void conv_depthwise_3x3s1p0_bias_s_relu(float* dout, + const float* din, + const float* weights, + const float* bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + 
const int w_in, + const int h_out, + const int w_out, + ARMContext* ctx); + +void conv_depthwise_3x3s1p1_bias_relu(float* dout, + const float* din, + const float* weights, + const float* bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext* ctx); + +void conv_depthwise_3x3s1p1_bias_s_relu(float* dout, + const float* din, + const float* weights, + const float* bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext* ctx); + +void conv_depthwise_3x3s2p0_bias_relu(float* dout, + const float* din, + const float* weights, + const float* bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext* ctx); + +void conv_depthwise_3x3s2p0_bias_s_relu(float* dout, + const float* din, + const float* weights, + const float* bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext* ctx); + +void conv_depthwise_3x3s2p1_bias_relu(float* dout, + const float* din, + const float* weights, + const float* bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext* ctx); + +void conv_depthwise_3x3s2p1_bias_s_relu(float* dout, + const float* din, + const float* weights, + const float* bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext* ctx); + } // namespace math } // namespace arm } // namespace lite diff --git a/lite/backends/arm/math/conv_impl.cc b/lite/backends/arm/math/conv_impl.cc index 
96d0893bc0f0a1c145f4e58dd2caecfba78786ab..4fcef3813b792808414415fa874e14f5ef253fcd 100644 --- a/lite/backends/arm/math/conv_impl.cc +++ b/lite/backends/arm/math/conv_impl.cc @@ -573,6 +573,22 @@ template void conv_im2col_gemm_int8(const int8_t* i_data, ARMContext* ctx, const float* scale); +template void im2col(const float* data_im, + int channels, + int height, + int width, + int kernel_h, + int kernel_w, + int pad_top, + int pad_bottom, + int pad_left, + int pad_right, + int stride_h, + int stride_w, + int dilation_h, + int dilation_w, + float* data_col); + void conv_depthwise_3x3_fp32(const void* din, void* dout, int num, @@ -613,6 +629,26 @@ void conv_depthwise_3x3_fp32(const void* din, act_param, ctx); } else { +#ifdef __aarch64__ + conv_3x3s1_depthwise_fp32(reinterpret_cast(din), + reinterpret_cast(dout), + num, + ch_out, + h_out, + w_out, + ch_in, + h_in, + w_in, + reinterpret_cast(weights), + bias, + param, + act_param, + ctx); +#else +#ifdef LITE_WITH_ARM_CLANG + LOG(FATAL) << "fp32 depthwise conv3x3s1px doesnot support in v7-clang, " + "this can run in basic"; +#else conv_3x3s1_depthwise_fp32(reinterpret_cast(din), reinterpret_cast(dout), num, @@ -627,6 +663,8 @@ void conv_depthwise_3x3_fp32(const void* din, param, act_param, ctx); +#endif +#endif } } else if (stride == 2) { if (pads_less && pad_h == pad_w && (pad < 2)) { // support pad = [0, 1] diff --git a/lite/backends/arm/math/conv_impl.h b/lite/backends/arm/math/conv_impl.h index 60f74b7feecc91a2fe8262a1fea4dce26430031d..28a2fb7e2a42a27e9ecd3d42b25f9942b481004e 100644 --- a/lite/backends/arm/math/conv_impl.h +++ b/lite/backends/arm/math/conv_impl.h @@ -359,6 +359,24 @@ void conv_compute_2x2_3x3_small(const float* input, const float* bias, const operators::ConvParam& param, ARMContext* ctx); + +template +void im2col(const Dtype* data_im, + int channels, + int height, + int width, + int kernel_h, + int kernel_w, + int pad_top, + int pad_bottom, + int pad_left, + int pad_right, + int stride_h, + int 
stride_w, + int dilation_h, + int dilation_w, + Dtype* data_col); + } // namespace math } // namespace arm } // namespace lite diff --git a/lite/backends/arm/math/elementwise.cc b/lite/backends/arm/math/elementwise.cc index 186ad19735799dcb91641354af4b4f09692bfce9..4d08c1e957d43b5b748ffdb90fd14a07a61d0183 100644 --- a/lite/backends/arm/math/elementwise.cc +++ b/lite/backends/arm/math/elementwise.cc @@ -266,6 +266,72 @@ void elementwise_add_relu_broadcast(const float* dinx, } } +template <> +void elementwise_add_grad(const float* dout_grad, + float* x_grad, + int num) { + int cnt = num >> 4; + int remain = num & 0x0f; +#pragma omp parallel for + for (int i = 0; i < cnt; ++i) { + const float* out_data = dout_grad + 16 * i; + float* x_data = x_grad + 16 * i; + float32x4_t din0 = vld1q_f32(out_data); + float32x4_t din1 = vld1q_f32(out_data + 4); + float32x4_t din2 = vld1q_f32(out_data + 8); + float32x4_t din3 = vld1q_f32(out_data + 12); + vst1q_f32(x_data, din0); + vst1q_f32(x_data + 4, din1); + vst1q_f32(x_data + 8, din2); + vst1q_f32(x_data + 12, din3); + } + if (remain > 0) { + const float* out_data = dout_grad + 16 * cnt; + float* x_data = x_grad + 16 * cnt; + for (int i = 0; i < remain; ++i) { + x_data[i] = out_data[i]; + } + } +} +// we assume that y_data numel less than x_data, otherwise, call this function +// by change x_grad and y_grad position +template <> +void elementwise_add_grad_broadcast(const float* dout_grad, + float* x_grad, + float* y_grad, + int pre, + int n, + int post) { + if (x_grad != nullptr) { + elementwise_add_grad(dout_grad, x_grad, pre * n * post); + } + if (y_grad != nullptr) { + memset(y_grad, 0, n * sizeof(float)); +#pragma omp parallel for + for (int i = 0; i < pre; ++i) { + for (int j = 0; j < n; ++j) { + float sum = 0; + int cnt = post >> 2; + int remain = post & 0x03; + const float* out_data = dout_grad + (i * n + j) * post; + float32x4_t sum_v = vdupq_n_f32(0); + for (int ci = 0; ci < cnt; ++ci) { + float32x4_t din = 
vld1q_f32(out_data + 4 * ci); + sum_v = vaddq_f32(sum_v, din); + } + out_data += 4 * cnt; + for (int ci = 0; ci < remain; ++ci) { + sum += out_data[ci]; + } + float32x2_t high = vget_high_f32(sum_v); + float32x2_t low = vget_low_f32(sum_v); + sum += vget_lane_f32(high, 0) + vget_lane_f32(high, 1) + + vget_lane_f32(low, 0) + vget_lane_f32(low, 1); + y_grad[j] += sum; + } + } + } +} template <> void elementwise_sub(const float* dinx, const float* diny, @@ -510,6 +576,84 @@ void elementwise_sub_relu_broadcast(const float* dinx, } } } +// we assume the formula is x-y +template <> +void elementwise_sub_grad(const float* dout_grad, + float* x_grad, + float* y_grad, + int num) { + if (x_grad != nullptr) { + elementwise_add_grad(dout_grad, x_grad, num); + } + if (y_grad != nullptr) { + int cnt = num >> 4; + int remain = num & 0x0f; + float32x4_t minus = vdupq_n_f32(-1); +#pragma omp parallel for + for (int i = 0; i < cnt; ++i) { + const float* out_data = dout_grad + 16 * i; + float* y_data = y_grad + 16 * i; + float32x4_t din0 = vld1q_f32(out_data); + float32x4_t din1 = vld1q_f32(out_data + 4); + float32x4_t din2 = vld1q_f32(out_data + 8); + float32x4_t din3 = vld1q_f32(out_data + 12); + din0 = vmulq_f32(din0, minus); + din1 = vmulq_f32(din1, minus); + din2 = vmulq_f32(din2, minus); + din3 = vmulq_f32(din3, minus); + vst1q_f32(y_data, din0); + vst1q_f32(y_data + 4, din1); + vst1q_f32(y_data + 8, din2); + vst1q_f32(y_data + 12, din3); + } + if (remain > 0) { + const float* out_data = dout_grad + 16 * cnt; + float* y_data = y_grad + 16 * cnt; + for (int i = 0; i < remain; ++i) { + y_data[i] = -out_data[i]; + } + } + } +} +// we assume that y_data numel less than x_data, otherwise, call this function +// by change x_grad and y_grad position +template <> +void elementwise_sub_grad_broadcast(const float* dout_grad, + float* x_grad, + float* y_grad, + int pre, + int n, + int post) { + if (x_grad != nullptr) { + elementwise_add_grad(dout_grad, x_grad, pre * n * post); + } + if 
(y_grad != nullptr) { + memset(y_grad, 0, n * sizeof(float)); +#pragma omp parallel for + for (int i = 0; i < pre; ++i) { + for (int j = 0; j < n; ++j) { + float sum = 0; + int cnt = post << 2; + int remain = post & 0x03; + const float* out_data = dout_grad + (i * n + j) * post; + float32x4_t sum_v = vdupq_n_f32(0); + for (int ci = 0; ci < cnt; ++ci) { + float32x4_t din = vld1q_f32(out_data + 4 * ci); + sum_v = vaddq_f32(sum_v, din); + } + out_data += 4 * cnt; + for (int ci = 0; ci < remain; ++ci) { + sum -= out_data[ci]; + } + float32x2_t high = vget_high_f32(sum_v); + float32x2_t low = vget_low_f32(sum_v); + sum -= vget_lane_f32(high, 0) + vget_lane_f32(high, 1) + + vget_lane_f32(low, 0) + vget_lane_f32(low, 1); + y_grad[j] += sum; + } + } + } +} template <> void elementwise_mul(const float* dinx, diff --git a/lite/backends/arm/math/elementwise.h b/lite/backends/arm/math/elementwise.h index f8273a5bb39505b03e911b5699cc10c5be755619..06ecab08edcaf06614de94b99084be2ee80647aa 100644 --- a/lite/backends/arm/math/elementwise.h +++ b/lite/backends/arm/math/elementwise.h @@ -13,11 +13,161 @@ // limitations under the License. 
#pragma once - +#include +#include +#include +#include "lite/operators/op_params.h" namespace paddle { namespace lite { namespace arm { namespace math { +template +void elementwise_broadcast_common(T const* x_data, + T const* y_data, + T* out_data, + std::vector x_real_dim, + std::vector y_real_dim, + std::vector out_real_dim, + std::string type, + bool is_xsize_large = false) { + int out_size = 1; + int max_dim = out_real_dim.size(); + std::vector index_array(max_dim, 0); + for (int i = 0; i < max_dim; ++i) { + out_size *= out_real_dim[i]; + } + int x_index, y_index; + for (int out_index = 0; out_index < out_size; ++out_index) { + x_index = 0; + for (int i = 0; i < max_dim; i++) { + if (x_real_dim[i] > 1) { + x_index = x_index * x_real_dim[i] + index_array[i]; + } + } + y_index = 0; + for (int i = 0; i < max_dim; i++) { + if (y_real_dim[i] > 1) { + y_index = y_index * y_real_dim[i] + index_array[i]; + } + } + + if (type == "add") { + out_data[out_index] = x_data[x_index] + y_data[y_index]; + } + if (type == "mul") { + out_data[out_index] = x_data[x_index] * y_data[y_index]; + } + } + for (int i = max_dim - 1; i >= 0; --i) { + ++index_array[i]; + if (index_array[i] >= out_real_dim[i]) { + index_array[i] -= out_real_dim[i]; + } else { + break; + } + } +} +template +void elementwise_compute_basic(const operators::ElementwiseParam& param, + const std::string elt_type, + const std::string act_type) { + const dtype* x_data = param.X->data(); + const dtype* y_data = param.Y->data(); + dtype* out_data = param.Out->mutable_data(); + auto x_dims = param.X->dims(); + auto y_dims = param.Y->dims(); + int axis = param.axis; + if (axis < 0) { + axis = x_dims.size() - y_dims.size(); + } + int batch = 1; + int channels = 1; + int num = 1; + for (int i = 0; i < axis; ++i) { + batch *= x_dims[i]; + } + for (int i = 0; i < y_dims.size(); ++i) { + channels *= y_dims[i]; + } + for (int i = y_dims.size() + axis; i < x_dims.size(); ++i) { + num *= x_dims[i]; + } + // do elementwise 
add/sub/max... + if (elt_type == "add") { + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channels; ++j) { + int offset = (i * channels + j) * num; + const dtype* din_ptr = x_data + offset; + const dtype diny_data = y_data[j]; + dtype* dout_ptr = out_data + offset; + for (int k = 0; k < num; ++k) { + *dout_ptr = *din_ptr + diny_data; + dout_ptr++; + din_ptr++; + } + } + } + } else if (elt_type == "sub") { + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channels; ++j) { + int offset = (i * channels + j) * num; + const dtype* din_ptr = x_data + offset; + const dtype diny_data = y_data[j]; + dtype* dout_ptr = out_data + offset; + for (int k = 0; k < num; ++k) { + *dout_ptr = *din_ptr - diny_data; + dout_ptr++; + } + } + } + } else if (elt_type == "mul") { + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channels; ++j) { + int offset = (i * channels + j) * num; + const dtype* din_ptr = x_data + offset; + const dtype diny_data = y_data[j]; + dtype* dout_ptr = out_data + offset; + for (int k = 0; k < num; ++k) { + *dout_ptr = *din_ptr * diny_data; + dout_ptr++; + din_ptr++; + } + } + } + } else if (elt_type == "max") { + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channels; ++j) { + int offset = (i * channels + j) * num; + const dtype* din_ptr = x_data + offset; + const dtype diny_data = y_data[j]; + dtype* dout_ptr = out_data + offset; + for (int k = 0; k < num; ++k) { + *dout_ptr = std::max(*din_ptr, diny_data); + dout_ptr++; + din_ptr++; + } + } + } + } else { + LOG(FATAL) << "unsupported Elementwise type: " << elt_type; + } + // do activation relu/sigmod... + if (act_type.size() > 0) { + if (act_type == "relu") { + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channels; ++j) { + dtype* dout_ptr = out_data + (i * channels + j) * num; + for (int k = 0; k < num; ++k) { + *dout_ptr = *dout_ptr > 0.0f ? 
*dout_ptr : 0.0f; + dout_ptr++; + } + } + } + } else { + LOG(FATAL) << "unsupported Activation type: " << elt_type; + } + } +} template void elementwise_add(const T* dinx, const T* diny, T* dout, int num); @@ -33,6 +183,13 @@ template void elementwise_add_relu_broadcast( const T* dinx, const T* diny, T* dout, int batch, int channels, int num); +template +void elementwise_add_grad(const T* dout, T* dinx, int num); + +template +void elementwise_add_grad_broadcast( + const T* dout_grad, T* x_grad, T* y_grad, int pre, int n, int post); + template void elementwise_sub(const T* dinx, const T* diny, T* dout, int num); @@ -47,6 +204,13 @@ template void elementwise_sub_relu_broadcast( const T* dinx, const T* diny, T* dout, int batch, int channels, int num); +template +void elementwise_sub_grad(const T* dout, T* dinx, T* diny, int num); + +template +void elementwise_sub_grad_broadcast( + const T* dout_grad, T* x_grad, T* y_grad, int pre, int n, int post); + template void elementwise_mul(const T* dinx, const T* diny, T* dout, int num); diff --git a/lite/backends/arm/math/gemm_prepacked_int8.cc b/lite/backends/arm/math/gemm_prepacked_int8.cc index d7e04bfc60b1214bd1e77738efa420d3e25e1456..08f88105e052322e13390b7482fed7d8dd15089b 100644 --- a/lite/backends/arm/math/gemm_prepacked_int8.cc +++ b/lite/backends/arm/math/gemm_prepacked_int8.cc @@ -572,6 +572,25 @@ inline void gemm_int8_kernel(const int8_t* a_ptr, #define GEMM_INT8_INT8_OUT \ GEMM_TRANS_INT32_TO_FP32 \ GEMM_INT8_RELU \ + "ld1 {v8.4s}, [%[vmax]] \n" /* v8 = -127 */ \ + /* data >= -127 */ \ + "fcmge v0.4s, v16.4s, v8.4s\n" \ + "fcmge v1.4s, v17.4s, v8.4s\n" \ + "fcmge v2.4s, v18.4s, v8.4s\n" \ + "fcmge v3.4s, v19.4s, v8.4s\n" \ + "fcmge v4.4s, v20.4s, v8.4s\n" \ + "fcmge v5.4s, v21.4s, v8.4s\n" \ + "fcmge v6.4s, v22.4s, v8.4s\n" \ + "fcmge v7.4s, v23.4s, v8.4s\n" \ + /* choose data */ \ + "bif v16.16b, v8.16b, v0.16b \n" \ + "bif v17.16b, v8.16b, v1.16b \n" \ + "bif v18.16b, v8.16b, v2.16b \n" \ + "bif v19.16b, v8.16b, 
v3.16b \n" \ + "bif v20.16b, v8.16b, v4.16b \n" \ + "bif v21.16b, v8.16b, v5.16b \n" \ + "bif v22.16b, v8.16b, v6.16b \n" \ + "bif v23.16b, v8.16b, v7.16b \n" \ "fcvtas v0.4s, v16.4s\n" /* 00, cvt to int */ \ "fcvtas v1.4s, v17.4s\n" /* 01, cvt to int */ \ "fcvtas v2.4s, v18.4s\n" /* 02, cvt to int */ \ @@ -580,6 +599,24 @@ inline void gemm_int8_kernel(const int8_t* a_ptr, "fcvtas v5.4s, v21.4s\n" /* 11, cvt to int */ \ "fcvtas v6.4s, v22.4s\n" /* 12, cvt to int */ \ "fcvtas v7.4s, v23.4s\n" /* 13, cvt to int */ \ + /* data >= -127 */ \ + "fcmge v16.4s, v24.4s, v8.4s\n" \ + "fcmge v17.4s, v25.4s, v8.4s\n" \ + "fcmge v18.4s, v26.4s, v8.4s\n" \ + "fcmge v19.4s, v27.4s, v8.4s\n" \ + "fcmge v20.4s, v28.4s, v8.4s\n" \ + "fcmge v21.4s, v29.4s, v8.4s\n" \ + "fcmge v22.4s, v30.4s, v8.4s\n" \ + "fcmge v23.4s, v31.4s, v8.4s\n" \ + /* choose data */ \ + "bif v24.16b, v8.16b, v16.16b\n" \ + "bif v25.16b, v8.16b, v17.16b\n" \ + "bif v26.16b, v8.16b, v18.16b\n" \ + "bif v27.16b, v8.16b, v19.16b\n" \ + "bif v28.16b, v8.16b, v20.16b\n" \ + "bif v29.16b, v8.16b, v21.16b\n" \ + "bif v30.16b, v8.16b, v22.16b\n" \ + "bif v31.16b, v8.16b, v23.16b\n" \ "sqxtn v16.4h, v0.4s\n" /* 00, cvt int32 to int16 */ \ "fcvtas v8.4s, v24.4s\n" /* 20, cvt to int */ \ "sqxtn2 v16.8h, v1.4s\n" /* 01, cvt int32 to int16 */ \ @@ -648,7 +685,7 @@ inline void gemm_int8_kernel(const int8_t* a_ptr, "v9","v10","v11","v12","v13","v14", "v15","v16","v17","v18","v19","v20", "v21","v22","v23","v24","v25","v26", - "v27","v28","v29","v30","v31","cc"); + "v27","v28","v29","v30","v31","cc", "memory"); // clang-format on } @@ -665,6 +702,7 @@ inline void gemm_int8_kernel(const int8_t* a_ptr, int k, int rem) { // clang-format off + float vmax[4] = {-127.0, -127.0, -127.0, -127.0}; asm volatile(GEMM_INT8_KERNEL GEMM_INT8_INT8_OUT : [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), @@ -676,13 +714,14 @@ inline void gemm_int8_kernel(const int8_t* a_ptr, : [is_relu] "r"(is_relu), [bias] "r"(bias), [rem] "r"(rem), - [scale] 
"r"(scale) + [scale] "r"(scale), + [vmax] "r"(vmax) : "v0","v1","v2","v3","v4","v5","v6","v7", "v8","v9","v10","v11","v12", "v13","v14","v15","v16","v17", "v18","v19","v20","v21","v22", "v23","v24","v25","v26","v27", - "v28","v29","v30","v31","cc"); + "v28","v29","v30","v31","cc", "memory"); // clang-format on } @@ -1179,6 +1218,25 @@ inline void gemm_sdot_int8_kernel(const int8_t* a_ptr, #define GEMM_SDOT_INT8_OUT \ GEMM_SDOT_CVT_INT32_TO_FP32 \ GEMM_SDOT_RELU \ + "ld1 {v6.4s}, [%[vmax]]\n" /* v8 = -127.f */ \ + /* data >= -127 */ \ + "fcmge v0.4s, v8.4s, v6.4s\n" \ + "fcmge v1.4s, v9.4s, v6.4s\n" \ + "fcmge v2.4s, v10.4s, v6.4s\n" \ + "fcmge v3.4s, v11.4s, v6.4s\n" \ + "fcmge v4.4s, v12.4s, v6.4s\n" \ + "fcmge v5.4s, v13.4s, v6.4s\n" \ + "fcmge v7.4s, v14.4s, v6.4s\n" \ + /* choose data */ \ + "bif v8.16b, v6.16b, v0.16b\n" \ + "fcmge v0.4s, v15.4s, v6.4s\n" \ + "bif v9.16b, v6.16b, v1.16b\n" \ + "bif v10.16b, v6.16b, v2.16b\n" \ + "bif v11.16b, v6.16b, v3.16b\n" \ + "bif v12.16b, v6.16b, v4.16b\n" \ + "bif v13.16b, v6.16b, v5.16b\n" \ + "bif v14.16b, v6.16b, v7.16b\n" \ + "bif v15.16b, v6.16b, v0.16b \n" \ "fcvtas v0.4s, v8.4s\n" /* 00, cvt to int */ \ "fcvtas v1.4s, v9.4s\n" /* 01, cvt to int */ \ "fcvtas v2.4s, v10.4s\n" /* 02, cvt to int */ \ @@ -1194,7 +1252,30 @@ inline void gemm_sdot_int8_kernel(const int8_t* a_ptr, "sqxtn2 v12.8h, v4.4s\n" /* 11, cvt int32 to int16 */ \ "sqxtn v13.4h, v5.4s\n" /* 12, cvt int32 to int16 */ \ "sqxtn v14.4h, v6.4s\n" /* 20, cvt int32 to int16 */ \ + "ld1 {v6.4s}, [%[vmax]]\n" /* v8 = -127.f */ \ "sqxtn2 v14.8h, v7.4s\n" /* 21, cvt int32 to int16 */ \ + /* data >= -127 */ \ + "fcmge v0.4s, v16.4s, v6.4s\n" \ + "fcmge v1.4s, v17.4s, v6.4s\n" \ + "fcmge v2.4s, v18.4s, v6.4s\n" \ + "fcmge v3.4s, v19.4s, v6.4s\n" \ + "fcmge v4.4s, v20.4s, v6.4s\n" \ + "fcmge v5.4s, v21.4s, v6.4s\n" \ + "fcmge v7.4s, v22.4s, v6.4s\n" \ + "fcmge v8.4s, v23.4s, v6.4s\n" \ + "fcmge v9.4s, v24.4s, v6.4s\n" \ + /* choose data */ \ + "bif v16.16b, 
v6.16b, v0.16b\n" \ + "fcmge v0.4s, v25.4s, v6.4s\n" \ + "bif v17.16b, v6.16b, v1.16b\n" \ + "bif v18.16b, v6.16b, v2.16b\n" \ + "bif v19.16b, v6.16b, v3.16b\n" \ + "bif v20.16b, v6.16b, v4.16b\n" \ + "bif v21.16b, v6.16b, v5.16b\n" \ + "bif v22.16b, v6.16b, v7.16b\n" \ + "bif v23.16b, v6.16b, v8.16b\n" \ + "bif v24.16b, v6.16b, v9.16b\n" \ + "bif v25.16b, v6.16b, v0.16b\n" \ "fcvtas v0.4s, v16.4s\n" /* 22, cvt to int */ \ "fcvtas v1.4s, v17.4s\n" /* 30, cvt to int */ \ "fcvtas v2.4s, v18.4s\n" /* 31, cvt to int */ \ @@ -1214,7 +1295,22 @@ inline void gemm_sdot_int8_kernel(const int8_t* a_ptr, "sqxtn v19.4h, v6.4s\n" /* 42, cvt int32 to int16 */ \ "sqxtn v20.4h, v7.4s\n" /* 50, cvt int32 to int16 */ \ "sqxtn2 v20.8h, v8.4s\n" /* 51, cvt int32 to int16 */ \ + "ld1 {v6.4s}, [%[vmax]]\n" /* v8 = -127.f */ \ "sqxtn v21.4h, v9.4s\n" /* 52, cvt int32 to int16 */ \ + /* data >= -127 */ \ + "fcmge v0.4s, v26.4s, v6.4s\n" \ + "fcmge v1.4s, v27.4s, v6.4s\n" \ + "fcmge v2.4s, v28.4s, v6.4s\n" \ + "fcmge v3.4s, v29.4s, v6.4s\n" \ + "fcmge v4.4s, v30.4s, v6.4s\n" \ + "fcmge v5.4s, v31.4s, v6.4s\n" \ + /* choose data */ \ + "bif v26.16b, v6.16b, v0.16b\n" \ + "bif v27.16b, v6.16b, v1.16b\n" \ + "bif v28.16b, v6.16b, v2.16b\n" \ + "bif v29.16b, v6.16b, v3.16b\n" \ + "bif v30.16b, v6.16b, v4.16b\n" \ + "bif v31.16b, v6.16b, v5.16b\n" \ "fcvtas v0.4s, v26.4s\n" /* 60, cvt to int */ \ "fcvtas v1.4s, v27.4s\n" /* 61, cvt to int */ \ "fcvtas v2.4s, v28.4s\n" /* 62, cvt to int */ \ @@ -1318,6 +1414,7 @@ inline void gemm_sdot_int8_kernel(const int8_t* a_ptr, int k, int tail) { // clang-format off + float32_t vmax[4] = {-127.0, -127.0, -127.0, -127.0}; asm volatile(GEMM_SDOT_INT8_KERNEL GEMM_SDOT_INT8_OUT : [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), @@ -1331,7 +1428,7 @@ inline void gemm_sdot_int8_kernel(const int8_t* a_ptr, [c_ptr5] "+r"(c_ptr5), [c_ptr6] "+r"(c_ptr6), [c_ptr7] "+r"(c_ptr7) - : [bias_ptr] "r"(bias), [scale] "r"(scale), [relu] "r"(is_relu) + : [bias_ptr] "r"(bias), 
[scale] "r"(scale), [relu] "r"(is_relu), [vmax] "r"(vmax) : "cc","memory","v0","v1","v2","v3", "v4","v5","v6","v7","v8","v9","v10", "v11","v12","v13","v14","v15","v16","v17", @@ -1614,6 +1711,24 @@ inline void gemm_sdot_int8_kernel(const int8_t* a_ptr, "vadd.f32 q3, q11, q3\n" /* r21, add offset */ \ "vadd.f32 q4, q12, q4\n" /* r30, add offset */ \ "vadd.f32 q5, q13, q5\n" /* r31, add offset */ \ + "vld1.32 {d12-d13}, [%[vmax]]\n" /* set q4 = -127 \n"*/ \ + "vcge.f32 q7, q8, q6\n" /* @ q8 >= -127 \n */ \ + "vcge.f32 q10, q9, q6\n" /* @ q8 >= -127 \n */ \ + "vcge.f32 q11, q0, q6\n" /* @ q8 >= -127 \n */ \ + "vcge.f32 q12, q1, q6\n" /* @ q8 >= -127 \n */ \ + "vcge.f32 q13, q2, q6\n" /* @ q8 >= -127 \n */ \ + "vcge.f32 q14, q3, q6\n" /* @ q8 >= -127 \n */ \ + "vcge.f32 q15, q4, q6\n" /* @ q8 >= -127 \n */ \ + /* choose data */ \ + "vbif q8, q6, q7\n" /* @ choose */ \ + "vcge.f32 q7, q5, q6\n" /* @ q8 >= -127 \n */ \ + "vbif q9, q6, q10\n" /* @ choose */ \ + "vbif q0, q6, q11\n" /* @ choose */ \ + "vbif q1, q6, q12\n" /* @ choose */ \ + "vbif q2, q6, q13\n" /* @ choose */ \ + "vbif q3, q6, q14\n" /* @ choose */ \ + "vbif q4, q6, q15\n" /* @ choose */ \ + "vbif q5, q6, q7\n" /* @ choose */ \ "vcvt.s32.f32 q6, q8\n" /* r00, fp32->int32 */ \ "vcvt.s32.f32 q7, q9\n" /* r01, fp32->int32 */ \ "vcvt.s32.f32 q10, q0\n" /* r10, fp32->int32 */ \ @@ -1682,7 +1797,8 @@ inline void gemm_int8_kernel(const int8_t* a_ptr, "q14", "q15", "r0", - "cc"); + "cc", + "memory"); } template <> @@ -1697,6 +1813,7 @@ inline void gemm_int8_kernel(const int8_t* a_ptr, bool is_relu, int k, int rem) { + float vmax[4] = {-127.0, -127.0, -127.0, -127.0}; asm volatile(GEMM_INT8_KERNEL GEMM_INT8_INT8_OUT : [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), @@ -1708,6 +1825,7 @@ inline void gemm_int8_kernel(const int8_t* a_ptr, : [is_relu] "r"(is_relu), [bias] "r"(bias), [rem] "r"(rem), + [vmax] "r"(vmax), [scale] "r"(scale) : "q0", "q1", @@ -1726,7 +1844,8 @@ inline void gemm_int8_kernel(const int8_t* a_ptr, 
"q14", "q15", "r0", - "cc"); + "cc", + "memory"); } #endif // __aarch64__ // NOLINT diff --git a/lite/backends/arm/math/gemv_arm_int8.cc b/lite/backends/arm/math/gemv_arm_int8.cc index dab42cdeca28d40622590632985603ce8eab1fb9..98c50de9e370fbe39c35156bf631b35362ff21b4 100644 --- a/lite/backends/arm/math/gemv_arm_int8.cc +++ b/lite/backends/arm/math/gemv_arm_int8.cc @@ -79,6 +79,7 @@ inline void write_gemv_out(const int* in, for (int i = 0; i < size; ++i) { out[0] = saturate_cast(roundf(*(in++) * *(scale++) + *(bias++))); + out[0] = out[0] < -127 ? -127 : out[0]; // -127 - 127 if (flag_relu) { out[0] = out[0] > 0 ? out[0] : 0; } @@ -87,6 +88,7 @@ inline void write_gemv_out(const int* in, } else { for (int i = 0; i < size; ++i) { out[0] = saturate_cast(roundf(*(in++) * *(scale++))); + out[0] = out[0] < -127 ? -127 : out[0]; // -127 - 127 if (flag_relu) { out[0] = out[0] > 0 ? out[0] : 0; } diff --git a/lite/backends/arm/math/increment.cc b/lite/backends/arm/math/increment.cc index 583ff52077e720510e66fcdb9604d1dc8992a90d..62c4f41eacda0356ca3967af877244856b3156d7 100644 --- a/lite/backends/arm/math/increment.cc +++ b/lite/backends/arm/math/increment.cc @@ -20,18 +20,7 @@ namespace paddle { namespace lite { namespace arm { -namespace math { -void increment(const float* input, - const int n, - const float step, - float* out, - Context* ctx) { - for (int i = 0; i < n; i++) { - out[i] = input[i] + step; - } -} - -} // namespace math +namespace math {} // namespace math } // namespace arm } // namespace lite } // namespace paddle diff --git a/lite/backends/arm/math/increment.h b/lite/backends/arm/math/increment.h index 028db0fd55e9507aa4f161339e4a8b0cd2e59ffe..ec6217d105bb73b5ab230518876471af91880d2d 100644 --- a/lite/backends/arm/math/increment.h +++ b/lite/backends/arm/math/increment.h @@ -21,11 +21,16 @@ namespace paddle { namespace lite { namespace arm { namespace math { -void increment(const float* input, +template +void increment(const T* input, const int n, const 
float step, - float* out, - Context* ctx); + T* out, + Context* ctx) { + for (int i = 0; i < n; i++) { + out[i] = input[i] + static_cast(step); + } +} } // namespace math } // namespace arm diff --git a/lite/backends/arm/math/layout.cc b/lite/backends/arm/math/layout.cc index fd9126ab48c8f829c82d0c78a338074c695f0b9c..214c386d553e3d5548bb4750c3130191a650830f 100644 --- a/lite/backends/arm/math/layout.cc +++ b/lite/backends/arm/math/layout.cc @@ -358,6 +358,8 @@ void NCHW2NHWC(int N, int C, int size, const int8_t* X, int8_t* Y) { "v14", "v15"); #else +#if 0 // TOOD(ysh329): caused assembly code error with register for armv7 + // **clang** compile asm volatile(TRANS_C8 : [din0_ptr] "+r"(din0_ptr), [din1_ptr] "+r"(din1_ptr), @@ -375,6 +377,7 @@ void NCHW2NHWC(int N, int C, int size, const int8_t* X, int8_t* Y) { [stride_w] "+r"(stride_w) : : "cc", "memory", "q0", "q1", "q2", "q3"); +#endif #endif } // const int8_t* din_ptr = din + 8 * cnt * size + s; // remain channel @@ -478,6 +481,8 @@ void NHWC2NCHW(int N, int C, int size, const float* X, float* Y) { "v10", "v11"); #else +#if 0 // TOOD(ysh329): caused assembly code error with register for armv7 + // **clang** compile asm volatile(TRANS_C4 : [din0_ptr] "+r"(din0_ptr), [din1_ptr] "+r"(din1_ptr), @@ -491,6 +496,7 @@ void NHWC2NCHW(int N, int C, int size, const float* X, float* Y) { [stride] "+r"(stride) : : "cc", "memory", "q0", "q1", "q2", "q3"); +#endif #endif } for (int i = 0; i < remain; i++) { @@ -593,6 +599,8 @@ void NHWC2NCHW(int N, int C, int size, const int8_t* X, int8_t* Y) { "v14", "v15"); #else +#if 0 // TOOD(ysh329): caused assembly code error with register for armv7 + // **clang** compile asm volatile(TRANS_C8 : [din0_ptr] "+r"(din0_ptr), [din1_ptr] "+r"(din1_ptr), @@ -610,6 +618,7 @@ void NHWC2NCHW(int N, int C, int size, const int8_t* X, int8_t* Y) { [stride_w] "+r"(stride_w) : : "cc", "memory", "q0", "q1", "q2", "q3"); +#endif #endif } for (int i = 0; i < remain; i++) { diff --git 
a/lite/backends/arm/math/lstm.cc b/lite/backends/arm/math/lstm.cc new file mode 100644 index 0000000000000000000000000000000000000000..5a2a263bb4fa2dc7b4ec54d84c698651a058f933 --- /dev/null +++ b/lite/backends/arm/math/lstm.cc @@ -0,0 +1,86 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/backends/arm/math/lstm.h" +#include "lite/backends/arm/math/funcs.h" + +namespace paddle { +namespace lite { +namespace arm { +namespace math { + +void add_bias_rowwise(Tensor* input, + const Tensor* bias, + int start_w, + int end_w) { + auto in_dim = input->dims(); + int width = input->numel() / in_dim[0]; + int w_adds = width < end_w ? 
width : end_w; + float* i_data = input->mutable_data(); + const float* b_data = bias->data(); + for (int i = 0; i < in_dim[0]; ++i) { + for (int w = start_w; w < w_adds; ++w) { + i_data[w] += b_data[w]; + } + } +} +void vector_dot( + float* out, const float* in, const float* v1, int size, const float* v2) { + int loop = size >> 2; + int remain = size & 3; + const float* in_ptr = in; + float* out_ptr = out; + const float* v1_ptr = v1; + const float* v2_ptr = v2; + for (int i = 0; i < loop; ++i) { + float32x4_t in = vld1q_f32(in_ptr); + float32x4_t data1 = vld1q_f32(v1_ptr); + if (!v2) { + // in_out * v1 + float32x4_t out = vmulq_f32(in, data1); + vst1q_f32(out_ptr, out); + in_ptr += 4; + v1_ptr += 4; + out_ptr += 4; + } else { + // in_out + v1 * v2 + float32x4_t data2 = vld1q_f32(v2_ptr); + float32x4_t out = vmlaq_f32(in, data1, data2); + vst1q_f32(out_ptr, out); + in_ptr += 4; + v1_ptr += 4; + out_ptr += 4; + v2_ptr += 4; + } + } + for (int i = 0; i < remain; ++i) { + if (!v2) { + out_ptr[i] = in_ptr[i] * v1_ptr[i]; + ++out_ptr; + ++in_ptr; + ++v1_ptr; + } else { + out_ptr[i] = in_ptr[i] + v1_ptr[i] * v2_ptr[i]; + ++out_ptr; + ++in_ptr; + ++v1_ptr; + ++v2_ptr; + } + } +} + +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/lite/backends/arm/math/lstm.h b/lite/backends/arm/math/lstm.h new file mode 100644 index 0000000000000000000000000000000000000000..e04581b055a93ac09da5ec6d5d57263fa2ad6261 --- /dev/null +++ b/lite/backends/arm/math/lstm.h @@ -0,0 +1,137 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include "lite/backends/arm/math/activation.h" +#include "lite/core/tensor.h" +#include "lite/utils/logging.h" +namespace paddle { +namespace lite { +namespace arm { +namespace math { + +void add_bias_rowwise(Tensor* input, + const Tensor* bias, + int start_w, + int end_w); + +inline float* row_offset(Tensor& input, int start) { // NOLINT + auto in_dim = input.dims(); + int width = input.numel() / in_dim[0]; + int offset = start < in_dim[0] ? start * width : input.numel(); + return input.mutable_data() + offset; +} +template +struct LstmMetaValue { + T* gate_value; + T* prev_state_value; + T* state_value; + T* state_active_value; + T* output_value; + T* check_ig; + T* check_fg; + T* check_og; +}; + +template +void activation( + const T* din, T* dout, int size, std::string act_str, int threads) { + if (act_str == "sigmoid") { + act_sigmoid(din, dout, size, threads); + } else if (act_str == "tanh") { + act_tanh(din, dout, size, threads); + } else if (act_str == "relu") { + act_relu(din, dout, size, threads); + } else { + LOG(FATAL) << "unsupport activation " << act_str; + } +} + +void vector_dot(float* out, + const float* in, + const float* v1, + int size, + const float* v2 = nullptr); + +template +struct LstmUnitFunctor { + static void compute(LstmMetaValue value, + int frame_size, + int batch_size, + T cell_clip, + std::string gate_act, + std::string cell_act, + std::string cand_act, + int threads) { + for (int b = 0; b < batch_size; ++b) { + const int temp_len = frame_size; + float zero_ptr[temp_len]; // 
NOLINT + memset(zero_ptr, 0, sizeof(float) * temp_len); + + T* value_in = value.gate_value; + T* value_ig = value_in + frame_size; + T* value_fg = value_ig + frame_size; + T* value_og = value_fg + frame_size; + T* state = value.state_value; + T* state_act = value.state_active_value; + T* output = value.output_value; + + T* check_i = value.check_ig ? value.check_ig : zero_ptr; + T* check_f = value.check_fg ? value.check_fg : zero_ptr; + T* check_o = value.check_og ? value.check_og : zero_ptr; + T* prev_state = + value.prev_state_value ? value.prev_state_value : zero_ptr; + + activation(value_in, value_in, frame_size, gate_act, threads); + vector_dot(value_ig, value_ig, prev_state, frame_size, check_i); + vector_dot(value_fg, value_fg, prev_state, frame_size, check_f); + activation(value_ig, value_ig, frame_size, cell_act, threads); + activation(value_fg, value_fg, frame_size, cell_act, threads); + vector_dot(state, value_in, value_ig, frame_size); + vector_dot(state, state, prev_state, frame_size, value_fg); + + for (int i = 0; i < frame_size; ++i) { + if (cell_clip > 0.0) { + if (state[i] < -1.0 * cell_clip) { + state[i] = -1.0 * cell_clip; + } + if (state[i] > cell_clip) { + state[i] = cell_clip; + } + } + } + + vector_dot(value_og, value_og, state, frame_size, check_o); + activation(value_og, value_og, frame_size, cell_act, threads); + activation(state, state_act, frame_size, cand_act, threads); + vector_dot(value.output_value, value_og, state_act, frame_size); + + value.gate_value += frame_size * 4; + value.state_value += frame_size; + value.state_active_value += frame_size; + value.output_value += frame_size; + if (value.prev_state_value) { + value.prev_state_value += frame_size; + } + } + } +}; + +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/lite/backends/arm/math/packed_sgemm.cc b/lite/backends/arm/math/packed_sgemm.cc index cb9c049d81aee73b65bacd27a64138779d1532cc..b41afc1c29e121f905b0abc48bae98705bc0ee16 
100644 --- a/lite/backends/arm/math/packed_sgemm.cc +++ b/lite/backends/arm/math/packed_sgemm.cc @@ -2289,6 +2289,29 @@ void sgemm_prepacked_8x12(bool is_transB, size_t l2_cache = ctx->llc_size() > 0 ? ctx->llc_size() : 512 * 1024; auto workspace = ctx->workspace_data(); int threads = ctx->threads(); + + auto act_type = act_param.active_type; + float alpha[4] = {0.f, 0.f, 0.f, 0.f}; + int flag_act = 0x00; // relu: 1, relu6: 2, leakey: 3 + if (act_param.has_active) { + if (act_type == lite_api::ActivationType::kRelu) { + flag_act = 0x01; + } else if (act_type == lite_api::ActivationType::kRelu6) { + flag_act = 0x02; + float local_alpha = act_param.Relu_clipped_coef; + alpha[0] = local_alpha; + alpha[1] = local_alpha; + alpha[2] = local_alpha; + alpha[3] = local_alpha; + } else if (act_type == lite_api::ActivationType::kLeakyRelu) { + flag_act = 0x03; + float local_alpha = act_param.Leaky_relu_alpha; + alpha[0] = local_alpha; + alpha[1] = local_alpha; + alpha[2] = local_alpha; + alpha[3] = local_alpha; + } + } //! 
MBLOCK * x (result) + MBLOCK * k (A) + x * k (B) = l2 int x_block = (l2_cache - (MBLOCK * K)) / (sizeof(float) * (K + MBLOCK)); x_block /= NBLOCK; @@ -2837,7 +2860,172 @@ void sgemm_prepacked_8x12(bool is_transB, "fmla v25.4s, v4.4s, v1.s[1]\n" /* out21 = b2 * a10[0], b2 =q7*/ "fmla v28.4s, v4.4s, v1.s[2]\n" /* out22 = b2 * a10[0], b2 =q7*/ "fmla v31.4s, v4.4s, v1.s[3]\n" /* out23 = b2 * a10[0], b2 =q7*/ - "11: \n" /* check if relu */ + + "11: \n" /* check activation */ + "cmp %w[flag_act], #1 \n" /* check if has relu */ + "bne 12f \n" /* jump if no relu */ + "movi v0.4s, #0 \n" /* for relu*/ + "fmax v8.4s, v8.4s, v0.4s \n" /* relu*/ + "fmax v9.4s, v9.4s, v0.4s \n" /* relu*/ + "fmax v10.4s, v10.4s, v0.4s \n" /* relu*/ + "fmax v11.4s, v11.4s, v0.4s \n" /* relu*/ + "fmax v12.4s, v12.4s, v0.4s \n" /* relu*/ + "fmax v13.4s, v13.4s, v0.4s \n" /* relu*/ + "fmax v14.4s, v14.4s, v0.4s \n" /* relu*/ + "fmax v15.4s, v15.4s, v0.4s \n" /* relu*/ + "fmax v16.4s, v16.4s, v0.4s \n" /* relu*/ + "fmax v17.4s, v17.4s, v0.4s \n" /* relu*/ + "fmax v18.4s, v18.4s, v0.4s \n" /* relu*/ + "fmax v19.4s, v19.4s, v0.4s \n" /* relu*/ + "fmax v20.4s, v20.4s, v0.4s \n" /* relu*/ + "fmax v21.4s, v21.4s, v0.4s \n" /* relu*/ + "fmax v22.4s, v22.4s, v0.4s \n" /* relu*/ + "fmax v23.4s, v23.4s, v0.4s \n" /* relu*/ + "fmax v24.4s, v24.4s, v0.4s \n" /* relu*/ + "fmax v25.4s, v25.4s, v0.4s \n" /* relu*/ + "fmax v26.4s, v26.4s, v0.4s \n" /* relu*/ + "fmax v27.4s, v27.4s, v0.4s \n" /* relu*/ + "fmax v28.4s, v28.4s, v0.4s \n" /* relu*/ + "fmax v29.4s, v29.4s, v0.4s \n" /* relu*/ + "fmax v30.4s, v30.4s, v0.4s \n" /* relu*/ + "fmax v31.4s, v31.4s, v0.4s \n" /* relu*/ + "b 20f \n" /* relu end */ + //! no act + "12: \n" /* no relu */ + "cmp %w[flag_act], #0 \n" /* check no act */ + "beq 20f \n" /* no act end */ + //! 
relu6 + "cmp %w[flag_act], #2 \n" /* check if has relu6 */ + "bne 13f \n" /* jump if no relu6 */ + "movi v0.4s, #0 \n" /* for relu6 */ + "ld1 {v1.4s}, [%[alpha]] \n" /* relu6 alpha */ + "fmax v8.4s, v8.4s, v0.4s \n" /* relu6 */ + "fmax v9.4s, v9.4s, v0.4s \n" /* relu6 */ + "fmax v10.4s, v10.4s, v0.4s \n" /* relu6 */ + "fmax v11.4s, v11.4s, v0.4s \n" /* relu6 */ + "fmax v12.4s, v12.4s, v0.4s \n" /* relu6 */ + "fmax v13.4s, v13.4s, v0.4s \n" /* relu6 */ + "fmax v14.4s, v14.4s, v0.4s \n" /* relu6 */ + "fmax v15.4s, v15.4s, v0.4s \n" /* relu6 */ + "fmax v16.4s, v16.4s, v0.4s \n" /* relu6 */ + "fmax v17.4s, v17.4s, v0.4s \n" /* relu6 */ + "fmax v18.4s, v18.4s, v0.4s \n" /* relu6 */ + "fmax v19.4s, v19.4s, v0.4s \n" /* relu6 */ + "fmax v20.4s, v20.4s, v0.4s \n" /* relu6 */ + "fmax v21.4s, v21.4s, v0.4s \n" /* relu6 */ + "fmax v22.4s, v22.4s, v0.4s \n" /* relu6 */ + "fmax v23.4s, v23.4s, v0.4s \n" /* relu6 */ + "fmax v24.4s, v24.4s, v0.4s \n" /* relu6 */ + "fmax v25.4s, v25.4s, v0.4s \n" /* relu6 */ + "fmax v26.4s, v26.4s, v0.4s \n" /* relu6 */ + "fmax v27.4s, v27.4s, v0.4s \n" /* relu6 */ + "fmax v28.4s, v28.4s, v0.4s \n" /* relu6 */ + "fmax v29.4s, v29.4s, v0.4s \n" /* relu6 */ + "fmax v30.4s, v30.4s, v0.4s \n" /* relu6 */ + "fmax v31.4s, v31.4s, v0.4s \n" /* relu6 */ + "fmin v8.4s, v8.4s, v1.4s \n" /* relu6 */ + "fmin v9.4s, v9.4s, v1.4s \n" /* relu6 */ + "fmin v10.4s, v10.4s, v1.4s \n" /* relu6 */ + "fmin v11.4s, v11.4s, v1.4s \n" /* relu6 */ + "fmin v12.4s, v12.4s, v1.4s \n" /* relu6 */ + "fmin v13.4s, v13.4s, v1.4s \n" /* relu6 */ + "fmin v14.4s, v14.4s, v1.4s \n" /* relu6 */ + "fmin v15.4s, v15.4s, v1.4s \n" /* relu6 */ + "fmin v16.4s, v16.4s, v1.4s \n" /* relu6 */ + "fmin v17.4s, v17.4s, v1.4s \n" /* relu6 */ + "fmin v18.4s, v18.4s, v1.4s \n" /* relu6 */ + "fmin v19.4s, v19.4s, v1.4s \n" /* relu6 */ + "fmin v20.4s, v20.4s, v1.4s \n" /* relu6 */ + "fmin v21.4s, v21.4s, v1.4s \n" /* relu6 */ + "fmin v22.4s, v22.4s, v1.4s \n" /* relu6 */ + "fmin v23.4s, v23.4s, v1.4s 
\n" /* relu6 */ + "fmin v24.4s, v24.4s, v1.4s \n" /* relu6 */ + "fmin v25.4s, v25.4s, v1.4s \n" /* relu6 */ + "fmin v26.4s, v26.4s, v1.4s \n" /* relu6 */ + "fmin v27.4s, v27.4s, v1.4s \n" /* relu6 */ + "fmin v28.4s, v28.4s, v1.4s \n" /* relu6 */ + "fmin v29.4s, v29.4s, v1.4s \n" /* relu6 */ + "fmin v30.4s, v30.4s, v1.4s \n" /* relu6 */ + "fmin v31.4s, v31.4s, v1.4s \n" /* relu6 */ + "b 20f \n" /* relu6 end */ + //! leakey relu + "13: \n" /* otherwise is leakey relu */ + "movi v0.4s, #0 \n" /* for leakey relu */ + "ld1 {v1.4s}, [%[alpha]] \n" /* leakey relu alpha */ + "fcmge v2.4s, v8.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v3.4s, v8.4s, v1.4s \n" /* vmulq_f32 */ + "fcmge v4.4s, v9.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v5.4s, v9.4s, v1.4s \n" /* vmulq_f32 */ + "fcmge v6.4s, v10.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v7.4s, v10.4s, v1.4s \n" /* vmulq_f32 */ + "bif v8.16b, v3.16b, v2.16b \n" /* choose*/ + "bif v9.16b, v5.16b, v4.16b \n" /* choose*/ + "bif v10.16b, v7.16b, v6.16b \n" /* choose*/ + "fcmge v2.4s, v11.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v3.4s, v11.4s, v1.4s \n" /* vmulq_f32 */ + "bif v11.16b, v3.16b, v2.16b \n" /* choose*/ + "fcmge v2.4s, v12.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v3.4s, v12.4s, v1.4s \n" /* vmulq_f32 */ + "fcmge v4.4s, v13.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v5.4s, v13.4s, v1.4s \n" /* vmulq_f32 */ + "fcmge v6.4s, v14.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v7.4s, v14.4s, v1.4s \n" /* vmulq_f32 */ + "bif v12.16b, v3.16b, v2.16b \n" /* choose*/ + "bif v13.16b, v5.16b, v4.16b \n" /* choose*/ + "bif v14.16b, v7.16b, v6.16b \n" /* choose*/ + "fcmge v2.4s, v15.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v3.4s, v15.4s, v1.4s \n" /* vmulq_f32 */ + "bif v15.16b, v3.16b, v2.16b \n" /* choose*/ + "fcmge v2.4s, v16.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v3.4s, v16.4s, v1.4s \n" /* vmulq_f32 */ + "fcmge v4.4s, v17.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v5.4s, v17.4s, v1.4s \n" /* vmulq_f32 */ + "fcmge v6.4s, v18.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v7.4s, 
v18.4s, v1.4s \n" /* vmulq_f32 */ + "bif v16.16b, v3.16b, v2.16b \n" /* choose*/ + "bif v17.16b, v5.16b, v4.16b \n" /* choose*/ + "bif v18.16b, v7.16b, v6.16b \n" /* choose*/ + "fcmge v2.4s, v19.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v3.4s, v19.4s, v1.4s \n" /* vmulq_f32 */ + "bif v19.16b, v3.16b, v2.16b \n" /* choose*/ + "fcmge v2.4s, v20.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v3.4s, v20.4s, v1.4s \n" /* vmulq_f32 */ + "fcmge v4.4s, v21.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v5.4s, v21.4s, v1.4s \n" /* vmulq_f32 */ + "fcmge v6.4s, v22.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v7.4s, v22.4s, v1.4s \n" /* vmulq_f32 */ + "bif v20.16b, v3.16b, v2.16b \n" /* choose*/ + "bif v21.16b, v5.16b, v4.16b \n" /* choose*/ + "bif v22.16b, v7.16b, v6.16b \n" /* choose*/ + "fcmge v2.4s, v23.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v3.4s, v23.4s, v1.4s \n" /* vmulq_f32 */ + "bif v23.16b, v3.16b, v2.16b \n" /* choose*/ + "fcmge v2.4s, v24.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v3.4s, v24.4s, v1.4s \n" /* vmulq_f32 */ + "fcmge v4.4s, v25.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v5.4s, v25.4s, v1.4s \n" /* vmulq_f32 */ + "fcmge v6.4s, v26.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v7.4s, v26.4s, v1.4s \n" /* vmulq_f32 */ + "bif v24.16b, v3.16b, v2.16b \n" /* choose*/ + "bif v25.16b, v5.16b, v4.16b \n" /* choose*/ + "bif v26.16b, v7.16b, v6.16b \n" /* choose*/ + "fcmge v2.4s, v27.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v3.4s, v27.4s, v1.4s \n" /* vmulq_f32 */ + "bif v27.16b, v3.16b, v2.16b \n" /* choose*/ + "fcmge v2.4s, v28.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v3.4s, v28.4s, v1.4s \n" /* vmulq_f32 */ + "fcmge v4.4s, v29.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v5.4s, v29.4s, v1.4s \n" /* vmulq_f32 */ + "fcmge v6.4s, v30.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v7.4s, v30.4s, v1.4s \n" /* vmulq_f32 */ + "bif v28.16b, v3.16b, v2.16b \n" /* choose*/ + "bif v29.16b, v5.16b, v4.16b \n" /* choose*/ + "bif v30.16b, v7.16b, v6.16b \n" /* choose*/ + "fcmge v2.4s, v31.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v3.4s, v31.4s, 
v1.4s \n" /* vmulq_f32 */ + "bif v31.16b, v3.16b, v2.16b \n" /* choose*/ + "20: \n" /* act end */ + "st1 {v8.4s, v9.4s, v10.4s},[%[c_ptr0]], #48\n" /* store r0 */ "st1 {v11.4s, v12.4s, v13.4s},[%[c_ptr1]], #48\n" /* store r1 */ "st1 {v14.4s, v15.4s, v16.4s},[%[c_ptr2]], #48\n" /* store r2 */ @@ -2861,7 +3049,9 @@ void sgemm_prepacked_8x12(bool is_transB, [c_ptr7] "+r"(c_ptr7) : [bias_ptr] "r"(bias_local), [has_beta] "r"(has_beta), - [beta] "r"(beta) + [beta] "r"(beta), + [alpha] "r"(alpha), + [flag_act] "r"(flag_act) : "cc","memory", "v0","v1","v2","v3","v4","v5","v6","v7", "v8","v9","v10","v11","v12","v13", @@ -2884,13 +3074,6 @@ void sgemm_prepacked_8x12(bool is_transB, } } } - if (act_param.has_active) { -#pragma omp parallel for num_threads(threads) - for (unsigned int x = 0; x < M; x++) { - float *dst = C + x * ldc; - act_switch_process(dst, dst, N, &act_param); - } - } } void sgemm_prepacked_4x4(bool is_transB, @@ -2911,6 +3094,28 @@ void sgemm_prepacked_4x4(bool is_transB, auto workspace = ctx->workspace_data(); int threads = ctx->threads(); + auto act_type = act_param.active_type; + float alpha[4] = {0.f, 0.f, 0.f, 0.f}; + int flag_act = 0x00; // relu: 1, relu6: 2, leakey: 3 + if (act_param.has_active) { + if (act_type == lite_api::ActivationType::kRelu) { + flag_act = 0x01; + } else if (act_type == lite_api::ActivationType::kRelu6) { + flag_act = 0x02; + float local_alpha = act_param.Relu_clipped_coef; + alpha[0] = local_alpha; + alpha[1] = local_alpha; + alpha[2] = local_alpha; + alpha[3] = local_alpha; + } else if (act_type == lite_api::ActivationType::kLeakyRelu) { + flag_act = 0x03; + float local_alpha = act_param.Leaky_relu_alpha; + alpha[0] = local_alpha; + alpha[1] = local_alpha; + alpha[2] = local_alpha; + alpha[3] = local_alpha; + } + } const int n_block = 4; const int m_block = 4; //! 
MBLOCK * x (result) + MBLOCK * k (A) + x * k (B) = l2 @@ -3137,7 +3342,51 @@ void sgemm_prepacked_4x4(bool is_transB, "fmla v10.4s, v6.4s, v2.s[2]\n" /* out2 = b2 * a20[2], b1 =q6*/ "fmla v11.4s, v6.4s, v2.s[3]\n" /* out3 = b2 * a20[3], b1 =q6*/ - "11: \n" /* check if relu */ + "11: \n" /* check activation */ + "cmp %w[flag_act], #1 \n" /* check if has relu */ + "bne 12f \n" /* jump if no relu */ + "movi v0.4s, #0 \n" /* for relu*/ + "fmax v8.4s, v8.4s, v0.4s \n" /* relu*/ + "fmax v9.4s, v9.4s, v0.4s \n" /* relu*/ + "fmax v10.4s, v10.4s, v0.4s \n" /* relu*/ + "fmax v11.4s, v11.4s, v0.4s \n" /* relu*/ + "b 20f \n" /* relu end */ + //! no act + "12: \n" /* no relu */ + "cmp %w[flag_act], #0 \n" /* check no act */ + "beq 20f \n" /* no act end */ + //! relu6 + "cmp %w[flag_act], #2 \n" /* check if has relu6 */ + "bne 13f \n" /* jump if no relu6 */ + "movi v0.4s, #0 \n" /* for relu6 */ + "ld1 {v1.4s}, [%[alpha]] \n" /* relu6 alpha */ + "fmax v8.4s, v8.4s, v0.4s \n" /* relu6 */ + "fmax v9.4s, v9.4s, v0.4s \n" /* relu6 */ + "fmax v10.4s, v10.4s, v0.4s \n" /* relu6 */ + "fmax v11.4s, v11.4s, v0.4s \n" /* relu6 */ + + "fmin v8.4s, v8.4s, v1.4s \n" /* relu6*/ + "fmin v9.4s, v9.4s, v1.4s \n" /* relu6*/ + "fmin v10.4s, v10.4s, v1.4s \n" /* relu6*/ + "fmin v11.4s, v11.4s, v1.4s \n" /* relu6*/ + "b 20f \n" /* relu6 end */ + //! 
leakey relu + "13: \n" /* otherwise is leakey relu */ + "movi v0.4s, #0 \n" /* for leakey relu */ + "ld1 {v1.4s}, [%[alpha]] \n" /* leakey relu alpha */ + "fcmge v2.4s, v8.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v3.4s, v8.4s, v1.4s \n" /* vmulq_f32 */ + "fcmge v4.4s, v9.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v5.4s, v9.4s, v1.4s \n" /* vmulq_f32 */ + "fcmge v6.4s, v10.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v7.4s, v10.4s, v1.4s \n" /* vmulq_f32 */ + "fcmge v12.4s, v11.4s, v0.4s \n" /* vcgeq_f32 */ + "fmul v13.4s, v11.4s, v1.4s \n" /* vmulq_f32 */ + "bif v8.16b, v3.16b, v2.16b \n" /* choose*/ + "bif v9.16b, v5.16b, v4.16b \n" /* choose*/ + "bif v10.16b, v7.16b, v6.16b \n" /* choose*/ + "bif v11.16b, v13.16b, v12.16b \n" /* choose*/ + "20: \n" /* act end */ "st1 {v8.4s}, [%[c_ptr0]], #16\n" /* store r0 */ "st1 {v9.4s}, [%[c_ptr1]], #16\n" /* store r1 */ "st1 {v10.4s}, [%[c_ptr2]], #16\n" /* store r2 */ @@ -3153,7 +3402,9 @@ void sgemm_prepacked_4x4(bool is_transB, [c_ptr3] "+r"(c_ptr3) : [bias_ptr] "r"(bias_local), [has_beta] "r"(has_beta), - [beta] "r"(beta) + [beta] "r"(beta), + [alpha] "r"(alpha), + [flag_act] "r"(flag_act) : "cc","memory", "v0","v1","v2","v3","v4","v5","v6","v7", "v8","v9","v10","v11"); @@ -3169,13 +3420,6 @@ void sgemm_prepacked_4x4(bool is_transB, } } } - if (act_param.has_active) { -#pragma omp parallel for num_threads(threads) - for (unsigned int x = 0; x < M; x++) { - float *dst = C + x * ldc; - act_switch_process(dst, dst, N, &act_param); - } - } } #else // __aarch64__ /** @@ -3206,6 +3450,28 @@ void sgemm_prepacked_6x8(bool is_transB, size_t l2_cache = ctx->llc_size() > 0 ? 
ctx->llc_size() : 512 * 1024; auto* workspace = ctx->workspace_data(); int threads = ctx->threads(); + auto act_type = act_param.active_type; + float alpha[4] = {0.f, 0.f, 0.f, 0.f}; + int flag_act = 0x00; // relu: 1, relu6: 2, leakey: 3 + if (act_param.has_active) { + if (act_type == lite_api::ActivationType::kRelu) { + flag_act = 0x01; + } else if (act_type == lite_api::ActivationType::kRelu6) { + flag_act = 0x02; + float local_alpha = act_param.Relu_clipped_coef; + alpha[0] = local_alpha; + alpha[1] = local_alpha; + alpha[2] = local_alpha; + alpha[3] = local_alpha; + } else if (act_type == lite_api::ActivationType::kLeakyRelu) { + flag_act = 0x03; + float local_alpha = act_param.Leaky_relu_alpha; + alpha[0] = local_alpha; + alpha[1] = local_alpha; + alpha[2] = local_alpha; + alpha[3] = local_alpha; + } + } //! MBLOCK * x (result) + MBLOCK * k (A) + x * k (B) = l2 int x_block = (l2_cache - (MBLOCK_OTH * K)) / (sizeof(float) * (K + MBLOCK_OTH)); @@ -3223,6 +3489,8 @@ void sgemm_prepacked_6x8(bool is_transB, tail_pre = KBLOCK; } + //! merge tail_pre and flag_act + tail_pre = (tail_pre << 2 | flag_act); bool flag_p_remain = false; int remain = 0; @@ -3456,13 +3724,14 @@ void sgemm_prepacked_6x8(bool is_transB, "vld1.32 {d0-d1}, [%[a_ptr] :64]! @ load a0~a3\n" "vmla.f32 q9, q3, d2[0] @ out8 += b2 * a2\n" "vmla.f32 q11, q3, d2[1] @ out9 += b2 * a3\n" - "subs %[k], %[k], #1 @ k--\n" + "subs %[k], %[k], #1 @ k--\n" "vmla.f32 q13, q3, d3[0] @ out10 += b2 * a4\n" "vmla.f32 q15, q3, d3[1] @ out11 += b2 * a5\n" - "bne 1b @ jump to main loop\n" - "0: @ process tail\n" - "subs %[tails], %[tails], #1 @ tail--\n" - "beq 3f @ jump to tail = 1\n" + "bne 1b @ jump to main loop\n" + "0: @ process tail\n" + "sub %[tails], %[tails], #4 @ tail--\n" + "cmp %[tails], #4 @ cmp with act bits\n" + "blt 3f @ jump to tail = 1\n" /* Unroll 0*/ "vld1.32 {d6-d7}, [%[b_ptr] :128]! 
@ load b2\n" "vmla.f32 q4, q2, d0[0] @ out0 += b1 * a0\n" @@ -3471,9 +3740,10 @@ void sgemm_prepacked_6x8(bool is_transB, "vmla.f32 q8, q2, d1[0] @ out2 += b1 * a2\n" "vmla.f32 q10, q2, d1[1] @ out3 += b1 * a3\n" "vmla.f32 q12, q2, d2[0] @ out4 += b1 * a4\n" - "subs %[tails], %[tails], #1 @ tail--\n" + "sub %[tails], %[tails], #4 @ tail--\n" "vmla.f32 q14, q2, d2[1] @ out5 += b1 * a5\n" "vld1.32 {d4-d5}, [%[b_ptr] :128]! @ load b1\n" + "cmp %[tails], #4 @ cmp with act bits\n" "vmla.f32 q5, q3, d0[0] @ out6 += b2 * a0\n" "vmla.f32 q7, q3, d0[1] @ out7 += b2 * a1\n" "vmla.f32 q9, q3, d1[0] @ out8 += b2 * a2\n" @@ -3482,16 +3752,17 @@ void sgemm_prepacked_6x8(bool is_transB, "vmla.f32 q13, q3, d2[0] @ out10 += b2 * a4\n" "vmla.f32 q15, q3, d2[1] @ out11 += b2 * a5\n" "vld1.32 {d6-d7}, [%[b_ptr] :128]! @ load b2\n" - "beq 4f @ jump to tail==2\n" + "blt 4f @ jump to tail==2\n" /* Unroll 1*/ "vmla.f32 q4, q2, d3[0] @ out0 += b1 * a0\n" "vmla.f32 q6, q2, d3[1] @ out1 += b1 * a1\n" - "subs %[tails], %[tails], #1 @ tail--\n" + "sub %[tails], %[tails], #4 @ tail--\n" "vmla.f32 q8, q2, d0[0] @ out2 += b1 * a2\n" "vmla.f32 q10, q2, d0[1] @ out3 += b1 * a3\n" "vmla.f32 q12, q2, d1[0] @ out4 += b1 * a4\n" "vmla.f32 q14, q2, d1[1] @ out5 += b1 * a5\n" "vld1.32 {d4-d5}, [%[b_ptr] :128]! @ load b1\n" + "cmp %[tails], #4 @ cmp with act bits\n" "vmla.f32 q5, q3, d3[0] @ out6 += b2 * a0\n" "vmla.f32 q7, q3, d3[1] @ out7 += b2 * a1\n" "vld1.32 {d2-d3}, [%[a_ptr] :64]! @ load a0~a3\n" @@ -3500,8 +3771,9 @@ void sgemm_prepacked_6x8(bool is_transB, "vmla.f32 q13, q3, d1[0] @ out10 += b2 * a4\n" "vmla.f32 q15, q3, d1[1] @ out11 += b2 * a5\n" "vld1.32 {d6-d7}, [%[b_ptr] :128]! @ load b2\n" - "beq 5f @ jump to tail==3\n" + "blt 5f @ jump to tail==3\n" /* Unroll 2 */ + "sub %[tails], %[tails], #4 @ tail--\n" "vld1.32 {d0-d1}, [%[a_ptr] :64]! 
@ load a4,a5, a0,a1\n" "vmla.f32 q4, q2, d2[0] @ out0 += b1 * a0\n" "vmla.f32 q6, q2, d2[1] @ out1 += b1 * a1\n" @@ -3579,7 +3851,99 @@ void sgemm_prepacked_6x8(bool is_transB, "vmla.f32 q11, q3, d3[1] @ out9 += b2 * a3\n" "vmla.f32 q13, q3, d0[0] @ out10 += b2 * a4\n" "vmla.f32 q15, q3, d0[1] @ out11 += b2 * a5\n" - "2: @ check relu\n" + "2: @ check activation\n" + //! relu + "cmp %[tails], #1 @ check if has relu\n" + "bne 6f @ jump if not relu \n" + "vmov.u32 q0, #0 @ for relu\n" + "vmax.f32 q4, q4, q0 @ for relu\n" + "vmax.f32 q5, q5, q0 @ for relu\n" + "vmax.f32 q6, q6, q0 @ for relu\n" + "vmax.f32 q7, q7, q0 @ for relu\n" + "vmax.f32 q8, q8, q0 @ for relu\n" + "vmax.f32 q9, q9, q0 @ for relu\n" + "vmax.f32 q10, q10, q0 @ for relu\n" + "vmax.f32 q11, q11, q0 @ for relu\n" + "vmax.f32 q12, q12, q0 @ for relu\n" + "vmax.f32 q13, q13, q0 @ for relu\n" + "vmax.f32 q14, q14, q0 @ for relu\n" + "vmax.f32 q15, q15, q0 @ for relu\n" + "b 10f @ relu end\n" + "6: @ no relu \n" + "cmp %[tails], #0 @ check no act\n" + "beq 10f @ no act end \n" + //! 
relu6 + "cmp %[tails], #2 @ check if has relu6\n" + "bne 7f @ jump if no relu6 \n" + "vmov.u32 q0, #0 @ for relu6\n" + "vmax.f32 q4, q4, q0 @ for relu6\n" + "vmax.f32 q5, q5, q0 @ for relu6\n" + "vmax.f32 q6, q6, q0 @ for relu6\n" + "vmax.f32 q7, q7, q0 @ for relu6\n" + "vmax.f32 q8, q8, q0 @ for relu6\n" + "vmax.f32 q9, q9, q0 @ for relu6\n" + "vld1.f32 {d2-d3}, [%[alpha]] @ load relu6 alpha\n" + "vmax.f32 q10, q10, q0 @ for relu6\n" + "vmax.f32 q11, q11, q0 @ for relu6\n" + "vmax.f32 q12, q12, q0 @ for relu6\n" + "vmax.f32 q13, q13, q0 @ for relu6\n" + "vmax.f32 q14, q14, q0 @ for relu6\n" + "vmax.f32 q15, q15, q0 @ for relu6\n" + + "vmin.f32 q4, q4, q1 @ for relu6\n" + "vmin.f32 q5, q5, q1 @ for relu6\n" + "vmin.f32 q6, q6, q1 @ for relu6\n" + "vmin.f32 q7, q7, q1 @ for relu6\n" + "vmin.f32 q8, q8, q1 @ for relu6\n" + "vmin.f32 q9, q9, q1 @ for relu6\n" + "vmin.f32 q10, q10, q1 @ for relu6\n" + "vmin.f32 q11, q11, q1 @ for relu6\n" + "vmin.f32 q12, q12, q1 @ for relu6\n" + "vmin.f32 q13, q13, q1 @ for relu6\n" + "vmin.f32 q14, q14, q1 @ for relu6\n" + "vmin.f32 q15, q15, q1 @ for relu6\n" + "b 10f @ relu6 end \n" + //! 
leakey relu + "7: @ otherwise is leakey relu\n" + "vmov.u32 q0, #0 @ for leakey relu \n" + "vld1.f32 {d2-d3}, [%[alpha]] @ load leakey relu alpha\n" + "vcge.f32 q2, q4, q0 @ vcgeq_u32 \n" + "vmul.f32 q3, q4, q1 @ vmulq_f32 \n" + "vbif q4, q3, q2 @ choose \n" + "vcge.f32 q2, q5, q0 @ vcgeq_u32 \n" + "vmul.f32 q3, q5, q1 @ vmulq_f32 \n" + "vbif q5, q3, q2 @ choose \n" + "vcge.f32 q2, q6, q0 @ vcgeq_u32 \n" + "vmul.f32 q3, q6, q1 @ vmulq_f32 \n" + "vbif q6, q3, q2 @ choose \n" + "vcge.f32 q2, q7, q0 @ vcgeq_u32 \n" + "vmul.f32 q3, q7, q1 @ vmulq_f32 \n" + "vbif q7, q3, q2 @ choose \n" + "vcge.f32 q2, q8, q0 @ vcgeq_u32 \n" + "vmul.f32 q3, q8, q1 @ vmulq_f32 \n" + "vbif q8, q3, q2 @ choose \n" + "vcge.f32 q2, q9, q0 @ vcgeq_u32 \n" + "vmul.f32 q3, q9, q1 @ vmulq_f32 \n" + "vbif q9, q3, q2 @ choose \n" + "vcge.f32 q2, q10, q0 @ vcgeq_u32 \n" + "vmul.f32 q3, q10, q1 @ vmulq_f32 \n" + "vbif q10, q3, q2 @ choose \n" + "vcge.f32 q2, q11, q0 @ vcgeq_u32 \n" + "vmul.f32 q3, q11, q1 @ vmulq_f32 \n" + "vbif q11, q3, q2 @ choose \n" + "vcge.f32 q2, q12, q0 @ vcgeq_u32 \n" + "vmul.f32 q3, q12, q1 @ vmulq_f32 \n" + "vbif q12, q3, q2 @ choose \n" + "vcge.f32 q2, q13, q0 @ vcgeq_u32 \n" + "vmul.f32 q3, q13, q1 @ vmulq_f32 \n" + "vbif q13, q3, q2 @ choose \n" + "vcge.f32 q2, q14, q0 @ vcgeq_u32 \n" + "vmul.f32 q3, q14, q1 @ vmulq_f32 \n" + "vbif q14, q3, q2 @ choose \n" + "vcge.f32 q2, q15, q0 @ vcgeq_u32 \n" + "vmul.f32 q3, q15, q1 @ vmulq_f32 \n" + "vbif q15, q3, q2 @ choose \n" + "10: @ act end \n" "vst1.32 {d8-d11}, [%[c_ptr0]]! @ store r0\n" "vst1.32 {d12-d15}, [%[c_ptr1]]! @ store r1\n" "vst1.32 {d16-d19}, [%[c_ptr2]]! 
@ store r2\n" @@ -3597,7 +3961,8 @@ void sgemm_prepacked_6x8(bool is_transB, [k] "+r"(k), [tails] "+r"(tails) : [bias_ptr] "r"(bias_local), - [beta] "r"(beta) + [beta] "r"(beta), + [alpha] "r" (alpha) : "q0","q1","q2","q3","q4", "q5","q6","q7","q8","q9","q10","q11", "q12","q13","q14","q15","cc","memory"); @@ -3616,13 +3981,6 @@ void sgemm_prepacked_6x8(bool is_transB, } } } - if (act_param.has_active) { -#pragma omp parallel for num_threads(threads) - for (unsigned int x = 0; x < M; x++) { - float* dst = C + x * ldc; - act_switch_process(dst, dst, N, &act_param); - } - } } void sgemm_prepacked_4x8(bool is_transB, @@ -3642,6 +4000,28 @@ void sgemm_prepacked_4x8(bool is_transB, size_t l2_cache = ctx->llc_size() > 0 ? ctx->llc_size() : 512 * 1024; auto* workspace = ctx->workspace_data(); int threads = ctx->threads(); + auto act_type = act_param.active_type; + float alpha[4] = {0.f, 0.f, 0.f, 0.f}; + int flag_act = 0x00; // relu: 1, relu6: 2, leakey: 3 + if (act_param.has_active) { + if (act_type == lite_api::ActivationType::kRelu) { + flag_act = 0x01; + } else if (act_type == lite_api::ActivationType::kRelu6) { + flag_act = 0x02; + float local_alpha = act_param.Relu_clipped_coef; + alpha[0] = local_alpha; + alpha[1] = local_alpha; + alpha[2] = local_alpha; + alpha[3] = local_alpha; + } else if (act_type == lite_api::ActivationType::kLeakyRelu) { + flag_act = 0x03; + float local_alpha = act_param.Leaky_relu_alpha; + alpha[0] = local_alpha; + alpha[1] = local_alpha; + alpha[2] = local_alpha; + alpha[3] = local_alpha; + } + } //! MBLOCK * x (result) + MBLOCK * k (A) + x * k (B) = l2 int x_block = (l2_cache - (MBLOCK_A73 * K)) / (sizeof(float) * (K + MBLOCK_A73)); @@ -3786,13 +4166,13 @@ void sgemm_prepacked_4x8(bool is_transB, "vmla.f32 q15, q3, q4\n" /* cr31 += beta * c_r31 */ "11: \n" /* check loop count */ "vld1.32 {d0-d3}, [%[a_ptr] :128]! @ load a0~a3\n" - "vld1.32 {d8-d11}, [%[b_ptr] :128]! 
@ load b1\n" - "cmp %[k], #0 @ check weather k is bigger than " + "vld1.32 {d8-d11}, [%[b_ptr] :128]! @ load b1\n" + "cmp %[k], #0 @ check weather k is bigger than " "0\n" - "beq 0f @ jump to tail\n" - "1: @ main loop for k\n" + "beq 0f @ jump to tail\n" + "1: @ main loop for k\n" /* Unroll 0*/ - "vld1.32 {d12-d15}, [%[b_ptr] :128]! @ load next b1, b2\n" + "vld1.32 {d12-d15}, [%[b_ptr] :128]! @ load next b1, b2\n" "vmla.f32 q8, q4, d0[0] @ out0 += b1 * a0\n" "vld1.32 {d4-d7}, [%[a_ptr] :128]! @ load next 2xa0~a3\n" "vmla.f32 q10, q4, d0[1] @ out1 += b1 * a1\n" @@ -3920,8 +4300,76 @@ void sgemm_prepacked_4x8(bool is_transB, "vmla.f32 q13, q5, d5[0] @ out6 += b2 * a2\n" "vmla.f32 q15, q5, d5[1] @ out7 += b2 * a3\n" /*aptr - 16*/ - "sub %[a_ptr], %[a_ptr], #16 @ tail--\n" - "2: @ check relu\n" + "sub %[a_ptr], %[a_ptr], #16 @ tail--\n" + "2: @ check relu\n" + //! relu + "cmp %[flag_act], #1 @ check if has relu\n" + "bne 6f @ jump if not relu \n" + "vmov.u32 q0, #0 @ for relu\n" + "vmax.f32 q8, q8, q0 @ for relu\n" + "vmax.f32 q9, q9, q0 @ for relu\n" + "vmax.f32 q10, q10, q0 @ for relu\n" + "vmax.f32 q11, q11, q0 @ for relu\n" + "vmax.f32 q12, q12, q0 @ for relu\n" + "vmax.f32 q13, q13, q0 @ for relu\n" + "vmax.f32 q14, q14, q0 @ for relu\n" + "vmax.f32 q15, q15, q0 @ for relu\n" + "b 10f @ relu end\n" + "6: @ no relu \n" + "cmp %[flag_act], #0 @ check no act\n" + "beq 10f @ no act end \n" + //! 
relu6 + "cmp %[flag_act], #2 @ check if has relu6\n" + "bne 7f @ jump if no relu6 \n" + "vmov.u32 q0, #0 @ for relu6\n" + "vld1.f32 {d2-d3}, [%[alpha]] @ load relu6 alpha\n" + "vmax.f32 q8, q8, q0 @ for relu6\n" + "vmax.f32 q9, q9, q0 @ for relu6\n" + "vmax.f32 q10, q10, q0 @ for relu6\n" + "vmax.f32 q11, q11, q0 @ for relu6\n" + "vmax.f32 q12, q12, q0 @ for relu6\n" + "vmax.f32 q13, q13, q0 @ for relu6\n" + "vmax.f32 q14, q14, q0 @ for relu6\n" + "vmax.f32 q15, q15, q0 @ for relu6\n" + + "vmin.f32 q8, q8, q1 @ for relu6\n" + "vmin.f32 q9, q9, q1 @ for relu6\n" + "vmin.f32 q10, q10, q1 @ for relu6\n" + "vmin.f32 q11, q11, q1 @ for relu6\n" + "vmin.f32 q12, q12, q1 @ for relu6\n" + "vmin.f32 q13, q13, q1 @ for relu6\n" + "vmin.f32 q14, q14, q1 @ for relu6\n" + "vmin.f32 q15, q15, q1 @ for relu6\n" + "b 10f @ relu6 end \n" + //! leakey relu + "7: @ otherwise is leakey relu\n" + "vmov.u32 q0, #0 @ for leakey relu \n" + "vld1.f32 {d2-d3}, [%[alpha]] @ load leakey relu alpha\n" + "vcge.f32 q2, q8, q0 @ vcgeq_u32 \n" + "vmul.f32 q3, q8, q1 @ vmulq_f32 \n" + "vbif q8, q3, q2 @ choose \n" + "vcge.f32 q2, q9, q0 @ vcgeq_u32 \n" + "vmul.f32 q3, q9, q1 @ vmulq_f32 \n" + "vbif q9, q3, q2 @ choose \n" + "vcge.f32 q2, q10, q0 @ vcgeq_u32 \n" + "vmul.f32 q3, q10, q1 @ vmulq_f32 \n" + "vbif q10, q3, q2 @ choose \n" + "vcge.f32 q2, q11, q0 @ vcgeq_u32 \n" + "vmul.f32 q3, q11, q1 @ vmulq_f32 \n" + "vbif q11, q3, q2 @ choose \n" + "vcge.f32 q2, q12, q0 @ vcgeq_u32 \n" + "vmul.f32 q3, q12, q1 @ vmulq_f32 \n" + "vbif q12, q3, q2 @ choose \n" + "vcge.f32 q2, q13, q0 @ vcgeq_u32 \n" + "vmul.f32 q3, q13, q1 @ vmulq_f32 \n" + "vbif q13, q3, q2 @ choose \n" + "vcge.f32 q2, q14, q0 @ vcgeq_u32 \n" + "vmul.f32 q3, q14, q1 @ vmulq_f32 \n" + "vbif q14, q3, q2 @ choose \n" + "vcge.f32 q2, q15, q0 @ vcgeq_u32 \n" + "vmul.f32 q3, q15, q1 @ vmulq_f32 \n" + "vbif q15, q3, q2 @ choose \n" + "10: @ act end \n" "vst1.32 {d16-d19}, [%[c_ptr0]]! @ store r0\n" "vst1.32 {d20-d23}, [%[c_ptr1]]! 
@ store r1\n" "vst1.32 {d24-d27}, [%[c_ptr2]]! @ store r2\n" @@ -3935,7 +4383,9 @@ void sgemm_prepacked_4x8(bool is_transB, [k] "+r"(k), [tails] "+r"(tails) : [bias_ptr] "r"(bias_local), - [beta] "r"(beta) + [beta] "r"(beta), + [alpha] "r"(alpha), + [flag_act] "r"(flag_act) : "q0","q1","q2","q3", "q4","q5","q6","q7","q8","q9","q10", "q11","q12","q13","q14","q15","cc","memory"); @@ -3951,13 +4401,6 @@ void sgemm_prepacked_4x8(bool is_transB, } } } - if (act_param.has_active) { -#pragma omp parallel for num_threads(threads) - for (unsigned int x = 0; x < M; x++) { - float* dst = C + x * ldc; - act_switch_process(dst, dst, N, &act_param); - } - } } #endif // __aarch64__ diff --git a/lite/backends/arm/math/pooling.cc b/lite/backends/arm/math/pooling.cc index 07cbd00378c082e311e194c7b22b6d3cb195a63a..0955b09d92f64066000b03c4487f359880f1c2a5 100644 --- a/lite/backends/arm/math/pooling.cc +++ b/lite/backends/arm/math/pooling.cc @@ -67,7 +67,6 @@ void pooling_basic(const float* din, } } else if (pooling_type == "avg") { // Pooling_average_include_padding - // Pooling_average_exclude_padding for (int n = 0; n < num; ++n) { float* dout_batch = dout + n * chout * size_channel_out; const float* din_batch = din + n * chin * size_channel_in; @@ -906,7 +905,9 @@ void pooling1x1s2p0_max(const float* din, int wout, int chin, int hin, - int win) { + int win, + int pad_bottom, + int pad_right) { int size_channel_out = wout * hout; int size_channel_in = win * hin; auto data_out = static_cast(dout); @@ -1021,7 +1022,9 @@ void pooling2x2s2_max(const float* din, int wout, int chin, int hin, - int win) { + int win, + int pad_bottom, + int pad_right) { int size_channel_out = wout * hout; int size_channel_in = win * hin; auto data_out = static_cast(dout); @@ -1104,7 +1107,9 @@ void pooling2x2s2_avg(const float* din, int chin, int hin, int win, - bool exclusive) { + bool exclusive, + int pad_bottom, + int pad_right) { int size_channel_out = wout * hout; int size_channel_in = win * hin; auto 
data_out = static_cast(dout); @@ -1117,6 +1122,9 @@ void pooling2x2s2_avg(const float* din, int w_unroll_size = wout / 4; int w_unroll_remian = wout - w_unroll_size * 4; float32x4_t vcoef = vdupq_n_f32(0.25f); // divided by 4 + auto zero_ptr = + static_cast(TargetMalloc(TARGET(kARM), win * sizeof(float))); + memset(zero_ptr, 0, win * sizeof(float)); for (int n = 0; n < num; ++n) { float* data_out_batch = data_out + n * chout * size_channel_out; @@ -1132,7 +1140,7 @@ void pooling2x2s2_avg(const float* din, auto dr0 = r0; auto dr1 = r1; if (h * S + K - P > hin) { - dr1 = r0; + dr1 = zero_ptr; } int cnt_num = w_unroll_size; if (w_unroll_size > 0) { @@ -1178,6 +1186,7 @@ void pooling2x2s2_avg(const float* din, } } } + TargetFree(TARGET(kARM), zero_ptr); } void pooling3x3s1p1_max(const float* din, @@ -1188,7 +1197,9 @@ void pooling3x3s1p1_max(const float* din, int wout, int chin, int hin, - int win) { + int win, + int pad_bottom, + int pad_right) { int size_channel_out = wout * hout; int size_channel_in = win * hin; auto data_out = static_cast(dout); @@ -1331,7 +1342,9 @@ void pooling3x3s1p1_avg(const float* din, int chin, int hin, int win, - bool exclusive) { + bool exclusive, + int pad_bottom, + int pad_right) { int size_channel_out = wout * hout; int size_channel_in = win * hin; auto data_out = static_cast(dout); @@ -1389,7 +1402,13 @@ void pooling3x3s1p1_avg(const float* din, if (exclusive) { coef_h = 1.f; } else { - coef_h = 0.5f; + if (pad_bottom > 1) { + coef_h = 1.f / 3; + } else if (pad_bottom == 1) { + coef_h = 0.5f; + } else { + coef_h = 1.f; + } } break; case 1: @@ -1401,7 +1420,11 @@ void pooling3x3s1p1_avg(const float* din, coef_h = 0.5f; } } else { - coef_h = 1.f / 3; + if (pad_bottom >= 1) { + coef_h = 1.f / 3; + } else { + coef_h = 0.5f; + } } default: break; @@ -1477,8 +1500,12 @@ void pooling3x3s1p1_avg(const float* din, int st = wstart > 0 ? 
wstart : 0; if (wstart + K > win) { wend = win; - if (!exclusive && wstart + K - win == 2) { - coef = coef_h / 2; + if (!exclusive) { + if (wstart + K - pad_right - win == 1) { + coef = coef_h / 2; + } else if (wstart + K - pad_right - win == 2) { + coef = coef_h; + } } } if (exclusive) { @@ -1509,7 +1536,9 @@ void pooling3x3s1p0_max(const float* din, int wout, int chin, int hin, - int win) { + int win, + int pad_bottom, + int pad_right) { int size_channel_out = wout * hout; int size_channel_in = win * hin; auto data_out = static_cast(dout); @@ -1646,7 +1675,9 @@ void pooling3x3s1p0_avg(const float* din, int chin, int hin, int win, - bool exclusive) { + bool exclusive, + int pad_bottom, + int pad_right) { int size_channel_out = wout * hout; int size_channel_in = win * hin; auto data_out = static_cast(dout); @@ -1692,7 +1723,13 @@ void pooling3x3s1p0_avg(const float* din, if (exclusive) { coef_h = 1.f; } else { - coef_h = 0.5f; + if (pad_bottom > 1) { + coef_h = 1.f / 3; + } else if (pad_bottom = 1) { + coef_h = 0.5f; + } else { + coef_h = 1.f; + } } break; case 1: @@ -1704,7 +1741,11 @@ void pooling3x3s1p0_avg(const float* din, coef_h = 0.5f; } } else { - coef_h = 1.f / 3; + if (pad_bottom >= 1) { + coef_h = 1.f / 3; + } else { + coef_h = 0.5f; + } } default: break; @@ -1776,8 +1817,12 @@ void pooling3x3s1p0_avg(const float* din, int st = wstart > 0 ? 
wstart : 0; if (wstart + K > win) { wend = win; - if (!exclusive && wstart + K - win == 2) { - coef = coef_h / 2; + if (!exclusive) { + if (wstart + K - pad_right - win == 1) { + coef = coef_h / 2; + } else if (wstart + K - pad_right - win == 2) { + coef = coef_h; + } } } if (exclusive) { @@ -1811,7 +1856,9 @@ void pooling3x3s2p1_max(const float* din, int wout, int chin, int hin, - int win) { + int win, + int pad_bottom, + int pad_right) { int size_channel_out = wout * hout; int size_channel_in = win * hin; auto data_out = static_cast(dout); @@ -1955,7 +2002,9 @@ void pooling3x3s2p1_avg(const float* din, int chin, int hin, int win, - bool exclusive) { + bool exclusive, + int pad_bottom, + int pad_right) { int size_channel_out = wout * hout; int size_channel_in = win * hin; auto data_out = static_cast(dout); @@ -2015,7 +2064,13 @@ void pooling3x3s2p1_avg(const float* din, if (exclusive) { coef_h = 1.f; } else { - coef_h = 0.5f; + if (pad_bottom > 1) { + coef_h = 1.f / 3; + } else if (pad_bottom == 1) { + coef_h = 0.5f; + } else { + coef_h = 1.f; + } } break; case 1: @@ -2027,7 +2082,11 @@ void pooling3x3s2p1_avg(const float* din, coef_h = 0.5f; } } else { - coef_h = 1.f / 3; + if (pad_bottom == 0) { + coef_h = 1.f / 2; + } else { + coef_h = 1.f / 3; + } } default: break; @@ -2102,8 +2161,12 @@ void pooling3x3s2p1_avg(const float* din, float coef = coef_h / 3.f; if (wstart + K > win) { wend = win; - if (!exclusive && wstart + K - win == 2) { - coef = coef_h / 2; + if (!exclusive) { + if (wstart + K - pad_right - win == 1) { + coef = coef_h / 2; + } else if (wstart + K - pad_right - win == 2) { + coef = coef_h; + } } } int st = wstart > 0 ? 
wstart : 0; @@ -2135,7 +2198,9 @@ void pooling3x3s2p0_max(const float* din, int wout, int chin, int hin, - int win) { + int win, + int pad_bottom, + int pad_right) { const int K = 3; const int P = 0; const int S = 2; @@ -2261,7 +2326,9 @@ void pooling3x3s2p0_avg(const float* din, int chin, int hin, int win, - bool exclusive) { + bool exclusive, + int pad_bottom, + int pad_right) { const int K = 3; const int P = 0; const int S = 2; @@ -2303,11 +2370,33 @@ void pooling3x3s2p0_avg(const float* din, case 2: dr1 = zero_ptr; dr2 = zero_ptr; - coef_h = 1.f; + if (exclusive) { + coef_h = 1.f; + } else { + if (pad_bottom >= 2) { + coef_h = 1.f / 3; + } else if (pad_bottom == 1) { + coef_h = 0.5f; + } else { + coef_h = 1.0f; + } + } break; case 1: dr2 = zero_ptr; - coef_h = 0.5f; + if (exclusive) { + if (fabsf(coef_h - 0.5f) < 1e-6f) { + coef_h = 1.f; + } else { + coef_h = 0.5f; + } + } else { + if (pad_bottom >= 1) { + coef_h = 1.0f / 3; + } else { + coef_h = 0.5f; + } + } break; default: break; @@ -2366,22 +2455,34 @@ void pooling3x3s2p0_avg(const float* din, dr2 -= 8; } // deal with right pad - int rem = win - (w_unroll_size * 4) * S; - int wstart = 0; + int wstart = w_unroll_size * 4 * S - P; for (int j = 0; j < w_unroll_remian; ++j) { - int wend = std::min(wstart + K, rem); - float coef = coef_h / (wend - wstart); + int wend = wstart + K; // std::min(wstart + K, win); + float coef = coef_h / 3.f; + if (wstart + K > win) { + wend = win; + if (!exclusive) { + if (wstart + K - pad_right - win == 1) { + coef = coef_h / 2; + } else if (wstart + K - pad_right - win == 2) { + coef = coef_h; + } + } + } + int st = wstart > 0 ? 
wstart : 0; + if (exclusive) { + coef = coef_h / (wend - st); + } float tmp = 0.f; - for (int i = wstart; i < wend; i++) { - tmp += dr0[i]; - tmp += dr1[i]; - tmp += dr2[i]; + for (int i = 0; i < wend - st; i++) { + tmp += dr0[i] + dr1[i] + dr2[i]; } - tmp *= coef; - *(dr_out++) = tmp; + *(dr_out++) = tmp * coef; + dr0 += S - (st - wstart); + dr1 += S - (st - wstart); + dr2 += S - (st - wstart); wstart += S; } - r0 = r2; r1 = r0 + win; r2 = r1 + win; diff --git a/lite/backends/arm/math/pooling.h b/lite/backends/arm/math/pooling.h index 701732cb453bfc9f2e970c83c8d713e70a205434..7bbffa8e2f4594da4be589569efc0ef18b8dd0da 100644 --- a/lite/backends/arm/math/pooling.h +++ b/lite/backends/arm/math/pooling.h @@ -72,7 +72,9 @@ void pooling1x1s2p0_max(const float* din, int wout, int chin, int hin, - int win); + int win, + int pad_bottom, + int pad_right); void pooling2x2s2_max(const float* din, float* dout, @@ -82,7 +84,9 @@ void pooling2x2s2_max(const float* din, int wout, int chin, int hin, - int win); + int win, + int pad_bottom, + int pad_right); void pooling2x2s2_avg(const float* din, float* dout, @@ -93,7 +97,9 @@ void pooling2x2s2_avg(const float* din, int chin, int hin, int win, - bool exclusive); + bool exclusive, + int pad_bottom, + int pad_right); void pooling3x3s1p1_max(const float* din, float* dout, @@ -103,7 +109,9 @@ void pooling3x3s1p1_max(const float* din, int wout, int chin, int hin, - int win); + int win, + int pad_bottom, + int pad_right); void pooling3x3s1p1_avg(const float* din, float* dout, @@ -114,7 +122,9 @@ void pooling3x3s1p1_avg(const float* din, int chin, int hin, int win, - bool exclusive); + bool exclusive, + int pad_bottom, + int pad_right); void pooling3x3s2p1_max(const float* din, float* dout, @@ -124,7 +134,9 @@ void pooling3x3s2p1_max(const float* din, int wout, int chin, int hin, - int win); + int win, + int pad_bottom, + int pad_right); void pooling3x3s1p0_max(const float* din, float* dout, @@ -134,7 +146,9 @@ void 
pooling3x3s1p0_max(const float* din, int wout, int chin, int hin, - int win); + int win, + int pad_bottom, + int pad_right); void pooling3x3s1p0_avg(const float* din, float* dout, @@ -145,7 +159,9 @@ void pooling3x3s1p0_avg(const float* din, int chin, int hin, int win, - bool exclusive); + bool exclusive, + int pad_bottom, + int pad_right); void pooling3x3s2p1_avg(const float* din, float* dout, @@ -156,7 +172,9 @@ void pooling3x3s2p1_avg(const float* din, int chin, int hin, int win, - bool exclusive); + bool exclusive, + int pad_bottom, + int pad_right); void pooling3x3s2p0_max(const float* din, float* dout, @@ -166,7 +184,9 @@ void pooling3x3s2p0_max(const float* din, int wout, int chin, int hin, - int win); + int win, + int pad_bottom, + int pad_right); void pooling3x3s2p0_avg(const float* din, float* dout, @@ -177,7 +197,9 @@ void pooling3x3s2p0_avg(const float* din, int chin, int hin, int win, - bool exclusive); + bool exclusive, + int pad_bottom, + int pad_right); } // namespace math } // namespace arm diff --git a/lite/backends/arm/math/scale.cc b/lite/backends/arm/math/scale.cc index 7f2169a6456bb04bda228cf62b89a125e4e2bb2f..5aad98c05c56f85931b7a0276d0a85b426573c4c 100644 --- a/lite/backends/arm/math/scale.cc +++ b/lite/backends/arm/math/scale.cc @@ -58,6 +58,43 @@ void scale( } } +template <> +void scale(const int* din, int* dout, int num, int scale, int bias) { + int cnt = num >> 4; + int remain = num % 16; + int32x4_t vscale = vdupq_n_s32(scale); + int32x4_t vbias = vdupq_n_s32(bias); +#pragma omp parallel for + for (int i = 0; i < cnt; i++) { + const int* din_ptr = din + (i << 4); + int* dout_ptr = dout + (i << 4); + + int32x4_t din0 = vld1q_s32(din_ptr); + int32x4_t din1 = vld1q_s32(din_ptr + 4); + int32x4_t din2 = vld1q_s32(din_ptr + 8); + int32x4_t din3 = vld1q_s32(din_ptr + 12); + + int32x4_t vsum1 = vmlaq_s32(vbias, din0, vscale); + int32x4_t vsum2 = vmlaq_s32(vbias, din1, vscale); + int32x4_t vsum3 = vmlaq_s32(vbias, din2, vscale); + int32x4_t 
vsum4 = vmlaq_s32(vbias, din3, vscale); + + vst1q_s32(dout_ptr, vsum1); + vst1q_s32(dout_ptr + 4, vsum2); + vst1q_s32(dout_ptr + 8, vsum3); + vst1q_s32(dout_ptr + 12, vsum4); + } + if (remain > 0) { + const int* din_ptr = din + (cnt << 4); + int* dout_ptr = dout + (cnt << 4); + for (int i = 0; i < remain; i++) { + *dout_ptr = *din_ptr * scale + bias; + dout_ptr++; + din_ptr++; + } + } +} + template <> void scale(const float* din, float* dout, diff --git a/lite/backends/arm/math/scale.h b/lite/backends/arm/math/scale.h index a86528c9df18cd6ef807bc116686b766ad905d82..910bea5613997c05e9257507f8f84792e0071a53 100644 --- a/lite/backends/arm/math/scale.h +++ b/lite/backends/arm/math/scale.h @@ -13,14 +13,32 @@ // limitations under the License. #pragma once - +#include "lite/core/tensor.h" +#include "lite/operators/op_params.h" namespace paddle { namespace lite { namespace arm { namespace math { +template +void scale_compute_basic(const operators::ScaleParam& param) { + const dtype* x_data = param.x->data(); + dtype* output_data = param.output->mutable_data(); + DDim x_dims = param.x->dims(); + DDim output_dims = param.output->dims(); + bool bias_after_scale = param.bias_after_scale; + float scale = param.scale; + float bias = param.bias; + if (!bias_after_scale) { + bias *= scale; + } + for (int i = 0; i < output_dims.production(); i++) { + output_data[i] = static_cast(x_data[i] * scale + bias); + } +} + template -void scale(const T* din, T* dout, int num, float scale, float bias); +void scale(const T* din, T* dout, int num, T scale, T bias); template void scale(const T* din, diff --git a/lite/backends/arm/math/sgemv.cc b/lite/backends/arm/math/sgemv.cc index 98404fe60fdb1384d390458e10dac8c967fd2b21..a7d4322326c9413878264400ba8118b510fade10 100644 --- a/lite/backends/arm/math/sgemv.cc +++ b/lite/backends/arm/math/sgemv.cc @@ -922,7 +922,7 @@ void sgemv_trans(const int M, /* end */ \ "4: \n" /* end */ \ "fmov s1, %w[alpha] \n" /* mov alpha to s1 */ \ - "fcmp s8, #0 \n" /* 
cmp with zero*/ \ + "fcmp s8, #0.0 \n" /* cmp with zero*/ \ "bge 5f \n" /* if ge zero */ \ "fmul s8, s8, s1 \n" /* out * alpha */ \ "5: \n" /* leakey relu label */ \ @@ -983,10 +983,12 @@ void sgemv_trans(const int M, "vld1.32 {d8-d11}, [%[in]]! @ load input, q4, q5\n" \ "vld1.32 {d12-d15}, [%[w0]]! @ load weights r0, q6,q7\n" \ "vld1.32 {d16-d19}, [%[w1]]! @ load weights r1, q8,q9\n" \ - "vld1.32 {d20-d23}, [%[w2]]! @ load weights r2, q10,q11\n" \ - "vld1.32 {d24-d27}, [%[w3]]! @ load weights r3, q12,q13\n" \ "vmla.f32 q0, q4, q6 @ mul add\n" \ + "vld1.32 {d20-d23}, [%[w2]]! @ load weights r2, q10,q11\n" \ "vmla.f32 q1, q4, q8 @ mul add\n" \ + "vld1.32 {d24-d27}, [%[w3]]! @ load weights r3, q12,q13\n" \ + /*"vmla.f32 q0, q4, q6 @ mul add\n" */ \ + /*"vmla.f32 q1, q4, q8 @ mul add\n" */ \ "vmla.f32 q2, q4, q10 @ mul add\n" \ "vmla.f32 q3, q4, q12 @ mul add\n" \ "subs %[cnt], #1 @ sub loop count \n" \ diff --git a/lite/backends/arm/math/topk.cc b/lite/backends/arm/math/topk.cc index c9239134e1c3988f5f9c39af6a69fec52fa0904f..83986dc1505098b0a23cdff31297e325fcb109a1 100644 --- a/lite/backends/arm/math/topk.cc +++ b/lite/backends/arm/math/topk.cc @@ -26,7 +26,7 @@ bool comp_func(std::pair a, std::pair b) { void topk(const float* in_data, float* out_val, - int* out_ind, + int64_t* out_ind, int m, int n, int k, @@ -34,7 +34,7 @@ void topk(const float* in_data, for (int i = 0; i < m; i++) { const float* in_tmp = in_data + i * n; float* out_val_tmp = out_val + i * k; - int* out_ind_tmp = out_ind + i * k; + int64_t* out_ind_tmp = out_ind + i * k; std::vector> vec; for (int j = 0; j < n; j++) { vec.push_back(std::make_pair(in_tmp[j], j)); diff --git a/lite/backends/arm/math/topk.h b/lite/backends/arm/math/topk.h index 5bf472e1af497398309689151f0d5354b3a48f27..a6716623228e6df0598410f52de56db58be7a8dc 100644 --- a/lite/backends/arm/math/topk.h +++ b/lite/backends/arm/math/topk.h @@ -22,7 +22,7 @@ namespace math { void topk(const float* din, float* out_val, - int* out_ind, + 
int64_t* out_ind, int m, int n, int k, diff --git a/lite/backends/arm/math/type_trans.cc b/lite/backends/arm/math/type_trans.cc index c50abb741ded487efa03d7d46baf2c6f13a8791d..c7c2da678bf55c45c2a2702ed413cf6bfc135c6a 100644 --- a/lite/backends/arm/math/type_trans.cc +++ b/lite/backends/arm/math/type_trans.cc @@ -40,13 +40,11 @@ void fp32_to_int8(const float* din, int cnt = inner_size / 16; int remain = inner_size & 15; int64_t loop_size = outer_size * axis_size; - #pragma omp parallel for for (int j = 0; j < loop_size; ++j) { float inv_scale = 1.f / scale[j % axis_size]; float32x4_t vzero = vdupq_n_f32(0.f); float32x4_t vscale = vdupq_n_f32(inv_scale); - float32x4_t vmax = vdupq_n_f32(-127.f); float32x4_t vpoff = vdupq_n_f32(0.5f); float32x4_t vnoff = vdupq_n_f32(-0.5f); const float* din_c = din + j * inner_size; @@ -56,6 +54,7 @@ void fp32_to_int8(const float* din, const float* din_ptr = din_c; signed char* dout_ptr = dout_c; #ifdef __aarch64__ + float32x4_t vmax = vdupq_n_f32(-127.0); asm volatile( "ldp q0, q1, [%[in]], #32 \n" "ldp q2, q3, [%[in]], #32 \n" @@ -64,16 +63,19 @@ void fp32_to_int8(const float* din, "fmul v5.4s, v1.4s, %[scale].4s \n" "fmul v6.4s, v2.4s, %[scale].4s \n" "fmul v7.4s, v3.4s, %[scale].4s \n" + /* data >= -127 */ "fcmge v8.4s, v4.4s, %[vmax].4s \n" "fcmge v9.4s, v5.4s, %[vmax].4s \n" "fcmge v10.4s, v6.4s, %[vmax].4s \n" "fcmge v11.4s, v7.4s, %[vmax].4s \n" + /* choose data */ "bif v4.16b, %[vmax].16b, v8.16b \n" "bif v5.16b, %[vmax].16b, v9.16b \n" "bif v6.16b, %[vmax].16b, v10.16b \n" "bif v7.16b, %[vmax].16b, v11.16b \n" "ldp q0, q1, [%[in]], #32 \n" "subs %[cnt], %[cnt], #1 \n" + /* fp32 - int32 */ "FCVTAS v8.4s, v4.4s \n" "FCVTAS v9.4s, v5.4s \n" "FCVTAS v10.4s, v6.4s \n" @@ -89,7 +91,9 @@ void fp32_to_int8(const float* din, "bne 0b \n" : [in] "+r"(din_ptr), [out] "+r"(dout_ptr), [cnt] "+r"(cnt_loop) : [scale] "w"(vscale), [vmax] "w"(vmax) - : "v0", + : "cc", + "memory", + "v0", "v1", "v2", "v3", @@ -102,6 +106,7 @@ void 
fp32_to_int8(const float* din, "v10", "v11"); #else + float vmax[4] = {-127.0, -127.0, -127.0, -127.0}; asm volatile( "vld1.32 {d0-d3}, [%[din]]! @ load in0~in7\n" "vld1.32 {d4-d7}, [%[din]]! @ load in8~in16\n" @@ -113,23 +118,27 @@ void fp32_to_int8(const float* din, "vcgt.f32 q8, q0, %q[vzero] @ get mask > 0, in0\n" "vcgt.f32 q9, q1, %q[vzero] @ get mask > 0, in1\n" "vcgt.f32 q10, q2, %q[vzero] @ get mask > 0, in2\n" + "vcgt.f32 q11, q3, %q[vzero] @ get mask > 0, in3\n" "vbif.f32 q4, %q[vnoff], q8 @ get right offset\n" - "vcgt.f32 q8, q3, %q[vzero] @ get mask > 0, in3\n" "vbif.f32 q5, %q[vnoff], q9 @ get right offset\n" "vbif.f32 q6, %q[vnoff], q10 @ get right offset\n" - "vbif.f32 q7, %q[vnoff], q8 @ get right offset\n" + "vbif.f32 q7, %q[vnoff], q11 @ get right offset\n" "vmla.f32 q4, q0, %q[vscale] @ mul scale\n" + "vld1.32 {d0-d1}, [%[vmax]] @ set q0 = -127 \n" "vmla.f32 q5, q1, %q[vscale] @ mul scale\n" "vmla.f32 q6, q2, %q[vscale] @ mul scale\n" "vmla.f32 q7, q3, %q[vscale] @ mul scale\n" - "vcge.f32 q8, q4, %q[vmax] @ q4 >= vmax \n" - "vcge.f32 q9, q5, %q[vmax] @ q4 >= vmax \n" - "vcge.f32 q10, q6, %q[vmax] @ q4 >= vmax \n" - "vbif q4, %q[vmax], q8 @ choose \n" - "vcge.f32 q8, q7, %q[vmax] @ q4 >= vmax \n" - "vbif q5, %q[vmax], q9 @ choose \n" - "vbif q6, %q[vmax], q10 @ choose \n" - "vbif q7, %q[vmax], q8 @ choose \n" + /* data >= -127 */ + "vcge.f32 q8, q4, q0 @ q4 >= -127 \n" + "vcge.f32 q9, q5, q0 @ q4 >= -127 \n" + "vcge.f32 q10, q6, q0 @ q4 >= -127 \n" + "vcge.f32 q11, q7, q0 @ q4 >= -127 \n" + /* choose data */ + "vbif q4, q0, q8 @ choose \n" + "vbif q5, q0, q9 @ choose \n" + "vbif q6, q0, q10 @ choose \n" + "vbif q7, q0, q11 @ choose \n" + /* fp32 - int32 */ "vcvt.s32.f32 q0, q4 @ cvt to int32\n" "vcvt.s32.f32 q1, q5 @ cvt to int32\n" "vcvt.s32.f32 q2, q6 @ cvt to int32\n" @@ -150,9 +159,22 @@ void fp32_to_int8(const float* din, : [vscale] "w"(vscale), [vpoff] "w"(vpoff), [vnoff] "w"(vnoff), - [vzero] "w"(vzero), - [vmax] "w"(vmax) - : "q0", "q1", 
"q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10"); + [vmax] "r"(vmax), + [vzero] "w"(vzero) + : "cc", + "memory", + "q0", + "q1", + "q2", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11"); #endif } const float* din_r = din_c + 16 * cnt; @@ -203,7 +225,7 @@ void fp32_to_int16(const float* din, "bne 0b \n" : [in] "+r"(din_ptr), [out] "+r"(dout_ptr), [cnt] "+r"(cnt_loop) : [scale] "w"(vscale) - : "v0", "v1", "v4", "v5", "v8", "v9"); + : "cc", "memory", "v0", "v1", "v4", "v5", "v8", "v9"); #else asm volatile( "vld1.32 {d0-d3}, [%[din]]! @ load in0~in7\n" @@ -232,7 +254,7 @@ void fp32_to_int16(const float* din, [vpoff] "w"(vpoff), [vnoff] "w"(vnoff), [vzero] "w"(vzero) - : "q0", "q1", "q4", "q5", "q6", "q7", "q8", "q9"); + : "cc", "memory", "q0", "q1", "q4", "q5", "q6", "q7", "q8", "q9"); #endif } const float* din_r = din_c + 8 * cnt; @@ -294,7 +316,9 @@ void int8_to_fp32(const int8_t* in, "bne 0b \n" : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr) : [scale] "w"(vscale) - : "v0", + : "cc", + "memory", + "v0", "v1", "v2", "v3", @@ -335,7 +359,7 @@ void int8_to_fp32(const int8_t* in, "bne 0b \n" : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr) : [scale] "w"(vscale) - : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); #endif // __aarch64__ } const signed char* din_r = din_c + 16 * cnt; @@ -394,7 +418,18 @@ void int16_to_fp32(const int16_t* in, "bne 0b \n" : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr) : [scale] "w"(vscale) - : "v0", "v1", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11"); + : "cc", + "memory", + "v0", + "v1", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11"); #else asm volatile( "vld1.32 {d0-d3}, [%[in]]! 
@ load 16 int16\n" @@ -422,7 +457,7 @@ void int16_to_fp32(const int16_t* in, "bne 0b \n" : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr) : [scale] "w"(vscale) - : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); #endif // __aarch64__ } const int16_t* din_r = din_c + 16 * cnt; @@ -473,7 +508,9 @@ void int32_to_fp32(const int* din, "bne 0b \n" : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr) : [scale] "w"(vscale) - : "v0", + : "cc", + "memory", + "v0", "v1", "v2", "v3", @@ -506,7 +543,9 @@ void int32_to_fp32(const int* din, "bne 0b \n" : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr) : [scale] "w"(vscale) - : "q0", + : "cc", + "memory", + "q0", "q1", "q2", "q3", @@ -551,41 +590,53 @@ void int32_to_int8(const int* din, const int* din_ptr = din_c; int8_t* dout_ptr = dout_c; #ifdef __aarch64__ + float32x4_t vmax = vdupq_n_f32(-127.0); asm volatile( "0: \n" "ld1 {v0.4s, v1.4s}, [%[in]], #32 \n" "ld1 {v2.4s, v3.4s}, [%[in]], #32 \n" - + /* int32 - fp32 */ "scvtf v4.4s, v0.4s \n" "scvtf v5.4s, v1.4s \n" "scvtf v6.4s, v2.4s \n" "scvtf v7.4s, v3.4s \n" - + /* mul scale */ "fmul v0.4s, v4.4s, %[scale].4s \n" "fmul v1.4s, v5.4s, %[scale].4s \n" "fmul v2.4s, v6.4s, %[scale].4s \n" "fmul v3.4s, v7.4s, %[scale].4s \n" - + /* data >= -127 */ + "fcmge v4.4s, v0.4s, %[vmax].4s \n" + "fcmge v5.4s, v1.4s, %[vmax].4s \n" + "fcmge v6.4s, v2.4s, %[vmax].4s \n" + "fcmge v7.4s, v3.4s, %[vmax].4s \n" + /* choose data */ + "bif v0.16b, %[vmax].16b, v4.16b \n" + "bif v1.16b, %[vmax].16b, v5.16b \n" + "bif v2.16b, %[vmax].16b, v6.16b \n" + "bif v3.16b, %[vmax].16b, v7.16b \n" + /* fp32 - int32 */ "fcvtas v4.4s, v0.4s \n" "fcvtas v5.4s, v1.4s \n" "fcvtas v6.4s, v2.4s \n" "fcvtas v7.4s, v3.4s \n" - + /* int32 - int16 */ "sqxtn v0.4h, v4.4s \n" "sqxtn2 v0.8h, v5.4s \n" "sqxtn v1.4h, v6.4s \n" "sqxtn2 v1.8h, v7.4s \n" - + /* int16 - int8 */ "sqxtn v2.8b, v0.8h \n" "sqxtn2 v2.16b, v1.8h \n" 
- + /* store */ "st1 {v2.16b}, [%[out]], #16 \n" "subs %[loop], %[loop], #1 \n" "bne 0b \n" : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr) - : [scale] "w"(vscale) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); + : [scale] "w"(vscale), [vmax] "w"(vmax) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); #else + float vmax[4] = {-127.0, -127.0, -127.0, -127.0}; asm volatile( "vld1.32 {d0-d3}, [%[din]]! @ load in0~in7\n" "vld1.32 {d4-d7}, [%[din]]! @ load in8~in16\n" @@ -607,9 +658,21 @@ void int32_to_int8(const int* din, "vbif.f32 q2, %q[vnoff], q10 @ get right offset\n" "vbif.f32 q3, %q[vnoff], q11 @ get right offset\n" "vmla.f32 q0, q4, %q[vscale] @ mul scale\n" + "vld1.32 {d8-d9}, [%[vmax]] @ set q4 = -127 \n" "vmla.f32 q1, q5, %q[vscale] @ mul scale\n" "vmla.f32 q2, q6, %q[vscale] @ mul scale\n" "vmla.f32 q3, q7, %q[vscale] @ mul scale\n" + /* data >= -127 */ + "vcge.f32 q8, q0, q4 @ q0 >= -127 \n" + "vcge.f32 q9, q1, q4 @ q1 >= -127 \n" + "vcge.f32 q10, q2, q4 @ q2 >= -127 \n" + "vcge.f32 q11, q3, q4 @ q3 >= -127 \n" + /* choose data */ + "vbif q0, q4, q8 @ choose \n" + "vbif q1, q4, q9 @ choose \n" + "vbif q2, q4, q10 @ choose \n" + "vbif q3, q4, q11 @ choose \n" + /* fp32 - int32 */ "vcvt.s32.f32 q4, q0 @ cvt to int32\n" "vcvt.s32.f32 q5, q1 @ cvt to int32\n" "vcvt.s32.f32 q6, q2 @ cvt to int32\n" @@ -628,9 +691,12 @@ void int32_to_int8(const int* din, : [loop] "+r"(loop), [din] "+r"(din_ptr), [dout] "+r"(dout_ptr) : [vscale] "w"(vscale), [vzero] "w"(vzero), + [vmax] "r"(vmax), [vnoff] "w"(vnoff), [vpoff] "w"(vpoff) - : "q0", + : "cc", + "memory", + "q0", "q1", "q2", "q3", @@ -648,6 +714,7 @@ void int32_to_int8(const int* din, int8_t* dout_r = dout_c + 16 * cnt; for (int i = 0; i < remain; ++i) { dout_r[i] = saturate_cast(roundf(in_scale * din_r[i])); + dout_r[i] = dout_r[i] < -127 ? 
-127 : dout_r[i]; } } } @@ -682,7 +749,7 @@ float compute_max_kernel(const float* din, int64_t size) { "bne 0b \n" : [in] "+r"(ptr_in), [cnt] "+r"(loop_cnt), [max_val] "+w"(vmax_val) : - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); #else asm volatile( "vld1.32 {d0-d3}, [%[in]]! @ load 8 float\n" @@ -703,7 +770,7 @@ float compute_max_kernel(const float* din, int64_t size) { : [in] "+r"(ptr_in), [cnt] "+r"(loop_cnt), [max_val] "+w"(vmax_val) : - : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); #endif float32x2_t vmax_p = vpmax_f32(vget_high_f32(vmax_val), vget_low_f32(vmax_val)); diff --git a/lite/backends/cuda/math/CMakeLists.txt b/lite/backends/cuda/math/CMakeLists.txt index fafd74ae7a43d1a769456edfe408c71593d21201..d26b1188c0878916986575b72cc978926ba5a1f6 100644 --- a/lite/backends/cuda/math/CMakeLists.txt +++ b/lite/backends/cuda/math/CMakeLists.txt @@ -2,7 +2,7 @@ if(NOT LITE_WITH_CUDA) return() endif() -get_property(cuda_static_deps GLOBAL PROPERTY CUDA_STATIC_MODULES) +get_property(cuda_static_deps GLOBAL PROPERTY CUDA_MODULES) nv_library(cuda_activation SRCS activation.cu DEPS ${cuda_static_deps}) nv_library(cuda_scale SRCS scale.cu DEPS ${cuda_static_deps}) diff --git a/lite/backends/cuda/target_wrapper.h b/lite/backends/cuda/target_wrapper.h index 5b57ddf0043c59219aded9836cc0b1ad982eec2d..3eeee84c1c46a65782e38b998bcd8142e08cbec1 100644 --- a/lite/backends/cuda/target_wrapper.h +++ b/lite/backends/cuda/target_wrapper.h @@ -39,13 +39,26 @@ class TargetWrapper { static void CreateStream(stream_t* stream) {} static void DestroyStream(const stream_t& stream) {} - static void CreateEvent(event_t* event) {} - static void DestroyEvent(const event_t& event) {} + static void CreateEvent(event_t* event) { cudaEventCreate(event); } + static void CreateEventWithFlags( + event_t* event, unsigned int flags = 
cudaEventDisableTiming) { + cudaEventCreateWithFlags(event, flags); + } + static void DestroyEvent(const event_t& event) { cudaEventDestroy(event); } static void RecordEvent(const event_t& event) {} + static void RecordEvent(const event_t& event, const stream_t& stream) { + cudaEventRecord(event, stream); + } static void SyncEvent(const event_t& event) {} - static void StreamSync(const stream_t& stream) {} + static void StreamSync(const stream_t& stream) { + cudaStreamSynchronize(stream); + } + static void StreamSync(const stream_t& stream, const event_t& event) { + cudaStreamWaitEvent(stream, event, 0); + } + static void DeviceSync() { cudaDeviceSynchronize(); } static void* Malloc(size_t size); static void Free(void* ptr); diff --git a/lite/backends/fpga/KD/debugger.hpp b/lite/backends/fpga/KD/debugger.hpp index 9b1189c407d6d601bb3e5ba8172b1455f04710fd..83b8dff70eb8de7cf1d117585d47118fed539a15 100755 --- a/lite/backends/fpga/KD/debugger.hpp +++ b/lite/backends/fpga/KD/debugger.hpp @@ -106,7 +106,7 @@ inline void read_from_file(lite::Tensor* t, const std::string& path) { inline void save_float(float* data, const std::string& name, int len) { static int counter = 0; - std::string old_string = std::to_string(counter); + std::string old_string = paddle::lite::to_string(counter); std::string new_string = std::string(3 - old_string.length(), '0') + old_string; diff --git a/lite/backends/fpga/KD/tensor.hpp b/lite/backends/fpga/KD/tensor.hpp index 988bc1bb507036de8f13a6c6549c549718bd1256..12a60bd27da832b338dc6b1ca11b1c7d6aa192e4 100644 --- a/lite/backends/fpga/KD/tensor.hpp +++ b/lite/backends/fpga/KD/tensor.hpp @@ -351,10 +351,10 @@ class Tensor { void printScale(std::string type) { printScale(); } std::string dimsFileName() { - return std::to_string(shape_->num()) + "_" + - std::to_string(shape_->channel()) + "_" + - std::to_string(shape_->height()) + "_" + - std::to_string(shape_->width()) + ".txt"; + return paddle::lite::to_string(shape_->num()) + "_" + + 
paddle::lite::to_string(shape_->channel()) + "_" + + paddle::lite::to_string(shape_->height()) + "_" + + paddle::lite::to_string(shape_->width()) + ".txt"; } void saveToFile() { std::string path = dimsFileName(); } @@ -374,7 +374,7 @@ class Tensor { invalidate(); std::ofstream ofs; static int counter = 0; - std::string npath = std::to_string(counter) + "_" + path; + std::string npath = paddle::lite::to_string(counter) + "_" + path; counter++; save_file_with_name(npath); } diff --git a/lite/backends/mlu/CMakeLists.txt b/lite/backends/mlu/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..29c90b422044be4e6a7aa9f4a8da45018a41f11a --- /dev/null +++ b/lite/backends/mlu/CMakeLists.txt @@ -0,0 +1,7 @@ +if(NOT LITE_WITH_MLU) + return() +endif() + +message (STATUS "Lite with mlu backend") + +lite_cc_library(target_wrapper_mlu SRCS target_wrapper.cc DEPS cnml_lib cnrt_lib) diff --git a/lite/backends/mlu/mlu_utils.h b/lite/backends/mlu/mlu_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..08dd355e8100a48363704168d264f6116ae58a79 --- /dev/null +++ b/lite/backends/mlu/mlu_utils.h @@ -0,0 +1,67 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include + +/* + * This file contains some MLU specific utils. 
+ */ + +#define CNRT_CALL(msg) \ + CHECK_EQ(static_cast(msg), CNRT_RET_SUCCESS) \ + << (msg) \ + << " MLU CNRT: " << cnrtGetErrorStr(static_cast(msg)) + +#define CNML_CALL(msg) \ + CHECK_EQ(static_cast(msg), CNML_STATUS_SUCCESS) \ + << (msg) << " MLU CNML: " \ + << ::paddle::lite::mlu::CnmlErrorInfo(static_cast(msg)) + +namespace paddle { +namespace lite { +namespace mlu { + +static const char* CnmlErrorInfo(int error) { + switch (error) { +#define LITE_CNML_ERROR_INFO(xx) \ + case xx: \ + return #xx; \ + break; + LITE_CNML_ERROR_INFO(CNML_STATUS_NODEVICE); + LITE_CNML_ERROR_INFO(CNML_STATUS_SUCCESS); + LITE_CNML_ERROR_INFO(CNML_STATUS_DOMAINERR); + LITE_CNML_ERROR_INFO(CNML_STATUS_INVALIDARG); + LITE_CNML_ERROR_INFO(CNML_STATUS_LENGTHERR); + LITE_CNML_ERROR_INFO(CNML_STATUS_OUTOFRANGE); + LITE_CNML_ERROR_INFO(CNML_STATUS_RANGEERR); + LITE_CNML_ERROR_INFO(CNML_STATUS_OVERFLOWERR); + LITE_CNML_ERROR_INFO(CNML_STATUS_UNDERFLOWERR); + LITE_CNML_ERROR_INFO(CNML_STATUS_INVALIDPARAM); + LITE_CNML_ERROR_INFO(CNML_STATUS_BADALLOC); + LITE_CNML_ERROR_INFO(CNML_STATUS_BADTYPEID); + LITE_CNML_ERROR_INFO(CNML_STATUS_BADCAST); + LITE_CNML_ERROR_INFO(CNML_STATUS_UNSUPPORT); +#undef LITE_CNML_ERROR_INFO + default: + return "unknown error"; + break; + } +} + +} // namespace mlu +} // namespace lite +} // namespace paddle diff --git a/lite/backends/mlu/target_wrapper.cc b/lite/backends/mlu/target_wrapper.cc new file mode 100644 index 0000000000000000000000000000000000000000..2385f69246a163830e0df855082d728da2743e02 --- /dev/null +++ b/lite/backends/mlu/target_wrapper.cc @@ -0,0 +1,91 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/backends/mlu/target_wrapper.h" + +#include + +#include "lite/backends/mlu/mlu_utils.h" + +namespace paddle { +namespace lite { +namespace mlu { + +void cnrtMemcpyHtoD(void* dst, const void* src, size_t size) { + CNRT_CALL(cnrtMemcpy( + dst, const_cast(src), size, CNRT_MEM_TRANS_DIR_HOST2DEV)) + << " cnrt memcpy htod failed"; +} + +void cnrtMemcpyDtoH(void* dst, const void* src, size_t size) { + CNRT_CALL(cnrtMemcpy( + dst, const_cast(src), size, CNRT_MEM_TRANS_DIR_DEV2HOST)) + << " cnrt memcpy dtoh failed"; +} + +} // namespace mlu + +size_t TargetWrapperMlu::num_devices() { + uint32_t dev_count = 0; + CNRT_CALL(cnrtGetDeviceCount(&dev_count)) << " cnrt get device count failed"; + LOG(INFO) << "Current MLU device count: " << dev_count; + return dev_count; +} + +void* TargetWrapperMlu::Malloc(size_t size) { + void* ptr{}; + CNRT_CALL(cnrtMalloc(&ptr, size)) << " cnrt malloc failed"; + // LOG(INFO) << "Malloc mlu ptr: " << ptr << " with size: " << size; + return ptr; +} + +void TargetWrapperMlu::Free(void* ptr) { + CNRT_CALL(cnrtFree(ptr)) << " cnrt free failed"; +} + +void TargetWrapperMlu::MemcpySync(void* dst, + const void* src, + size_t size, + IoDirection dir) { + // LOG(INFO) << "dst: " << dst << " src: " << src << " size: " << size + //<< " dir: " << (int)dir; + switch (dir) { + case IoDirection::DtoD: { + std::unique_ptr cpu_tmp_ptr(new char[size]); + mlu::cnrtMemcpyDtoH(cpu_tmp_ptr.get(), src, size); + mlu::cnrtMemcpyHtoD(dst, cpu_tmp_ptr.get(), size); + break; + } + case IoDirection::HtoD: + mlu::cnrtMemcpyHtoD(dst, src, 
size); + break; + case IoDirection::DtoH: + mlu::cnrtMemcpyDtoH(dst, src, size); + break; + default: + LOG(FATAL) << "Unsupported IoDirection" << static_cast(dir); + } +} + +// void TargetWrapperMlu::MemcpyAsync(void* dst, +// const void* src, +// size_t size, +// IoDirection dir, +// const stream_t& stream) { +// LOG(WARNING) << "Mlu unsupported MemcpyAsync now, use MemcpySync."; +// MemcpySync(dst, src, size, dir); +// } + +} // namespace lite +} // namespace paddle diff --git a/lite/backends/mlu/target_wrapper.h b/lite/backends/mlu/target_wrapper.h new file mode 100644 index 0000000000000000000000000000000000000000..2d9e10806f78e56f50b04d408dab219c923456fc --- /dev/null +++ b/lite/backends/mlu/target_wrapper.h @@ -0,0 +1,54 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "lite/backends/mlu/mlu_utils.h" +#include "lite/core/target_wrapper.h" + +namespace paddle { +namespace lite { + +using TargetWrapperMlu = TargetWrapper; + +template <> +class TargetWrapper { + public: + using queue_t = cnrtQueue_t; + + static size_t num_devices(); + static size_t maxinum_queue() { return 0; } // TODO(zhangshijin): fix out it. 
+ + static size_t GetCurDevice() { return 0; } + + static void CreateQueue(queue_t* queue) {} + static void DestroyQueue(const queue_t& queue) {} + + static void QueueSync(const queue_t& queue) {} + + static void* Malloc(size_t size); + static void Free(void* ptr); + + static void MemcpySync(void* dst, + const void* src, + size_t size, + IoDirection dir); + // static void MemcpyAsync(void* dst, + // const void* src, + // size_t size, + // IoDirection dir, + // const queue_t& queue); +}; + +} // namespace lite +} // namespace paddle diff --git a/lite/backends/npu/device.cc b/lite/backends/npu/device.cc index d62ac9cad3e5ab4e6f63e3b667e3fa93e244fec1..345b239c320f04eba8426483a23a352e77a71036 100644 --- a/lite/backends/npu/device.cc +++ b/lite/backends/npu/device.cc @@ -19,8 +19,8 @@ namespace paddle { namespace lite { namespace npu { -std::unique_ptr Device::Build( - std::string& model_name, // NOLINT +std::shared_ptr Device::Build( + const std::string model_name, // NOLINT std::vector& input_nodes, // NOLINT std::vector& output_nodes // NOLINT ) { @@ -41,15 +41,15 @@ std::unique_ptr Device::Build( ir_build.ReleaseModelBuff(om_model_buf); return nullptr; } + // Create a HiAI model manager client to load the HiAI om model - std::unique_ptr model_client( + std::shared_ptr model_client( new hiai::AiModelMngerClient()); if (model_client->Init(nullptr) != hiai::AI_SUCCESS) { LOG(WARNING) << "[NPU] AiModelMngerClient init failed)!"; ir_build.ReleaseModelBuff(om_model_buf); return nullptr; } - model_name = "model_" + std::to_string(model_count_++) + ".om"; auto model_desc = std::make_shared( model_name, freq_level(), framework_type(), model_type(), device_type()); model_desc->SetModelBuffer(om_model_buf.data, om_model_buf.length); diff --git a/lite/backends/npu/device.h b/lite/backends/npu/device.h index 411600ae0a38e4ee1b4a3ce3d6519b927eeb0a1a..6733a7f6dfa085d2c64274a81ba2a028ebe88f3f 100644 --- a/lite/backends/npu/device.h +++ b/lite/backends/npu/device.h @@ -40,8 +40,8 @@ 
class Device { // Build the HiAI IR graph to om model, return HiAI model manager client to // load om model and run inference. - std::unique_ptr Build( - std::string& model_name, // NOLINT + std::shared_ptr Build( + const std::string model_name, // NOLINT std::vector& input_nodes, // NOLINT std::vector& output_nodes // NOLINT ); // NOLINT @@ -51,7 +51,6 @@ class Device { int framework_type_{0}; int model_type_{0}; int device_type_{0}; - int model_count_{0}; }; } // namespace npu diff --git a/lite/backends/opencl/CMakeLists.txt b/lite/backends/opencl/CMakeLists.txt index dd7f6b417e0d6416eec9bb3e60ef088432776112..0ac8cf310370f34ae5743113efe1d71579979daf 100644 --- a/lite/backends/opencl/CMakeLists.txt +++ b/lite/backends/opencl/CMakeLists.txt @@ -2,17 +2,16 @@ if (NOT LITE_WITH_OPENCL) return() endif() +lite_cc_library(opencl_kernels_source_cc SRCS opencl_kernels_source.cc) lite_cc_library(cl_wrapper SRCS cl_wrapper.cc) lite_cc_library(cl_utility SRCS cl_utility.cc DEPS cl_wrapper) -lite_cc_library(cl_runtime SRCS cl_runtime.cc DEPS cl_utility) +lite_cc_library(cl_runtime SRCS cl_runtime.cc DEPS cl_utility opencl_kernels_source_cc) lite_cc_library(cl_context SRCS cl_context.cc DEPS cl_runtime) -lite_cc_library(cl_image_converter SRCS cl_image_converter.cc DEPS tensor) +lite_cc_library(cl_half SRCS cl_half.cc) +lite_cc_library(cl_image_converter SRCS cl_image_converter.cc DEPS tensor cl_half) lite_cc_library(cl_image SRCS cl_image.cc DEPS tensor cl_image_converter cl_runtime) lite_cc_library(cl_caller SRCS cl_caller.cc DEPS cl_context cl_image) lite_cc_library(cl_target_wrapper SRCS target_wrapper.cc DEPS cl_runtime) -lite_cc_test(test_cl_functions SRCS cl_functions_test.cc DEPS cl_context cl_image cl_caller cl_wrapper cl_target_wrapper - ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) -lite_cc_test(test_cl_im2col SRCS cl_im2col_test.cc DEPS tensor cl_context cl_wrapper cl_target_wrapper - ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) 
+lite_cc_test(test_cl_functions SRCS cl_functions_test.cc DEPS cl_context cl_image cl_caller cl_wrapper cl_target_wrapper) add_dependencies(cl_wrapper opencl_clhpp) diff --git a/lite/backends/opencl/cl_caller.cc b/lite/backends/opencl/cl_caller.cc index 6b9cab1056beaa6f516a0d3a202a7816c911f1b2..8421c784d5da224eacaaa9461b737eed1b4bdd4e 100644 --- a/lite/backends/opencl/cl_caller.cc +++ b/lite/backends/opencl/cl_caller.cc @@ -30,7 +30,7 @@ static void CopyImageData(CLContext* context, int width = cl_image.image_dims()[0]; int height = cl_image.image_dims()[1]; - float* image_data = new float[height * width * 4]; + uint16_t* image_data = new uint16_t[height * width * 4]; cl::Image* image = cl_image.cl_image(); cl::array origin = {0, 0, 0}; cl::array region = { @@ -46,9 +46,8 @@ static void CopyImageData(CLContext* context, delete[] image_data; } -bool InitOpenCLRuntime(std::string cl_path) { +bool InitOpenCLRuntime() { auto* runtime = CLRuntime::Global(); - runtime->set_cl_path(cl_path); return runtime->IsInitSuccess(); } diff --git a/lite/backends/opencl/cl_caller.h b/lite/backends/opencl/cl_caller.h index 1817db9f6bd6d9ecf21978b8293bd9534328de0f..d1f1429e44f8872852797dadcbf2f82c1c9c0269 100644 --- a/lite/backends/opencl/cl_caller.h +++ b/lite/backends/opencl/cl_caller.h @@ -21,7 +21,7 @@ limitations under the License. 
*/ namespace paddle { namespace lite { -bool InitOpenCLRuntime(std::string cl_path); +bool InitOpenCLRuntime(); } // namespace lite } // namespace paddle diff --git a/lite/backends/opencl/cl_context.cc b/lite/backends/opencl/cl_context.cc index 0fcb99486eac57e36ee548b809f8f141e0807db8..f0105e060f03df3e4d49c358cf314730cdd16393 100644 --- a/lite/backends/opencl/cl_context.cc +++ b/lite/backends/opencl/cl_context.cc @@ -41,8 +41,7 @@ cl::Program &CLContext::GetProgram(const std::string &file_name, return *(it->second); } - auto program = CLRuntime::Global()->CreateProgram( - GetContext(), CLRuntime::Global()->cl_path() + "/cl_kernel/" + file_name); + auto program = CLRuntime::Global()->CreateProgram(GetContext(), file_name); VLOG(3) << " --- begin build program -> " << program_key << " --- "; CLRuntime::Global()->BuildProgram(program.get(), options); @@ -122,5 +121,34 @@ cl::NDRange CLContext::DefaultWorkSize(const CLImage &image) { } } +cl::NDRange CLContext::LocalWorkSize(cl::NDRange global_work_size, + size_t max_work_size) { + int preferred_lws = 0; + int divisor = 2; + + auto tmp0 = global_work_size[0]; + auto tmp1 = global_work_size[1]; + auto tmp2 = global_work_size[2]; + + if (divisor > 1) { + max_work_size /= divisor; + } + if (preferred_lws > 0 && preferred_lws <= max_work_size) { + max_work_size = preferred_lws; + } + while (tmp1 > max_work_size && max_work_size > 0) { + tmp1 = tmp1 % 2 == 0 ? tmp1 / 2 : 1; + } + while (tmp2 * tmp1 > max_work_size && max_work_size > 0) { + tmp2 = tmp2 % 2 == 0 ? tmp2 / 2 : 1; + } + while (tmp0 * tmp1 * tmp2 > max_work_size && max_work_size > 0) { + tmp0 = tmp0 % 2 == 0 ? 
tmp0 / 2 : 1; + } + return cl::NDRange{static_cast(tmp0), + static_cast(tmp1), + static_cast(tmp2)}; +} + } // namespace lite } // namespace paddle diff --git a/lite/backends/opencl/cl_context.h b/lite/backends/opencl/cl_context.h index a28f82f40ecd70a38fcd179e3c7dedfb02a6bcd1..1964c4bf56b55841ba735c79b2f7a17dc1ed451e 100644 --- a/lite/backends/opencl/cl_context.h +++ b/lite/backends/opencl/cl_context.h @@ -44,6 +44,8 @@ class CLContext { cl::NDRange DefaultWorkSize(const CLImage &image); + cl::NDRange LocalWorkSize(cl::NDRange global_work_size, size_t max_work_size); + private: std::unordered_map> programs_; std::vector> kernels_; diff --git a/lite/backends/opencl/cl_functions_test.cc b/lite/backends/opencl/cl_functions_test.cc index 70f47b47946641edf4d023437b48d46cae93ca6e..ba32d8c803bfd832289a936fe9150ba8d14cd984 100644 --- a/lite/backends/opencl/cl_functions_test.cc +++ b/lite/backends/opencl/cl_functions_test.cc @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include #include #include #include @@ -26,22 +25,18 @@ limitations under the License. 
*/ #include "lite/core/tensor.h" #include "lite/utils/cp_logging.h" -DEFINE_string(cl_path, "/data/local/tmp/opencl", "The OpenCL kernels path."); - namespace paddle { namespace lite { TEST(cl_test, runtime_test) { auto *runtime = CLRuntime::Global(); CHECK(runtime->IsInitSuccess()); - runtime->set_cl_path(FLAGS_cl_path); runtime->platform(); runtime->device(); runtime->command_queue(); auto &context = runtime->context(); - auto program = runtime->CreateProgram( - context, - runtime->cl_path() + "/cl_kernel/" + "buffer/elementwise_add_kernel.cl"); + auto program = + runtime->CreateProgram(context, "buffer/elementwise_add_kernel.cl"); auto event = runtime->CreateEvent(context); const std::string build_option("-DCL_DTYPE_float"); CHECK(runtime->BuildProgram(program.get(), build_option)); @@ -50,7 +45,6 @@ TEST(cl_test, runtime_test) { TEST(cl_test, context_test) { auto *runtime = CLRuntime::Global(); CHECK(runtime->IsInitSuccess()); - runtime->set_cl_path(FLAGS_cl_path); CLContext context; context.AddKernel("pool_max", "image/pool_kernel.cl", "-DCL_DTYPE_float"); context.AddKernel( @@ -62,7 +56,6 @@ TEST(cl_test, context_test) { TEST(cl_test, kernel_test) { auto *runtime = CLRuntime::Global(); CHECK(runtime->IsInitSuccess()); - runtime->set_cl_path(FLAGS_cl_path); std::unique_ptr context(new CLContext); context->AddKernel( "elementwise_add", "image/elementwise_add_kernel.cl", "-DCL_DTYPE_float"); @@ -121,7 +114,7 @@ TEST(cl_test, kernel_test) { } TEST(cl_test, target_wrapper_buffer_test) { - bool inited = InitOpenCLRuntime(FLAGS_cl_path); + bool inited = InitOpenCLRuntime(); CHECK(inited) << "Fail to initialize OpenCL runtime."; std::unique_ptr context(new CLContext); std::string kernel_name = "elementwise_add"; diff --git a/lite/backends/opencl/cl_half.cc b/lite/backends/opencl/cl_half.cc new file mode 100644 index 0000000000000000000000000000000000000000..0f27cae549c30eb7295a7c9490d9fb106883dda7 --- /dev/null +++ b/lite/backends/opencl/cl_half.cc @@ -0,0 +1,518 @@ 
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "lite/backends/opencl/cl_half.h" + +namespace paddle { +namespace lite { + +// ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf + +static const uint32_t mantissatable[2048] = { + 0x00000000, 0x33800000, 0x34000000, 0x34400000, 0x34800000, 0x34a00000, + 0x34c00000, 0x34e00000, 0x35000000, 0x35100000, 0x35200000, 0x35300000, + 0x35400000, 0x35500000, 0x35600000, 0x35700000, 0x35800000, 0x35880000, + 0x35900000, 0x35980000, 0x35a00000, 0x35a80000, 0x35b00000, 0x35b80000, + 0x35c00000, 0x35c80000, 0x35d00000, 0x35d80000, 0x35e00000, 0x35e80000, + 0x35f00000, 0x35f80000, 0x36000000, 0x36040000, 0x36080000, 0x360c0000, + 0x36100000, 0x36140000, 0x36180000, 0x361c0000, 0x36200000, 0x36240000, + 0x36280000, 0x362c0000, 0x36300000, 0x36340000, 0x36380000, 0x363c0000, + 0x36400000, 0x36440000, 0x36480000, 0x364c0000, 0x36500000, 0x36540000, + 0x36580000, 0x365c0000, 0x36600000, 0x36640000, 0x36680000, 0x366c0000, + 0x36700000, 0x36740000, 0x36780000, 0x367c0000, 0x36800000, 0x36820000, + 0x36840000, 0x36860000, 0x36880000, 0x368a0000, 0x368c0000, 0x368e0000, + 0x36900000, 0x36920000, 0x36940000, 0x36960000, 0x36980000, 0x369a0000, + 0x369c0000, 0x369e0000, 0x36a00000, 0x36a20000, 0x36a40000, 0x36a60000, + 0x36a80000, 0x36aa0000, 0x36ac0000, 0x36ae0000, 0x36b00000, 0x36b20000, + 0x36b40000, 0x36b60000, 0x36b80000, 0x36ba0000, 0x36bc0000, 0x36be0000, + 0x36c00000, 
0x36c20000, 0x36c40000, 0x36c60000, 0x36c80000, 0x36ca0000, + 0x36cc0000, 0x36ce0000, 0x36d00000, 0x36d20000, 0x36d40000, 0x36d60000, + 0x36d80000, 0x36da0000, 0x36dc0000, 0x36de0000, 0x36e00000, 0x36e20000, + 0x36e40000, 0x36e60000, 0x36e80000, 0x36ea0000, 0x36ec0000, 0x36ee0000, + 0x36f00000, 0x36f20000, 0x36f40000, 0x36f60000, 0x36f80000, 0x36fa0000, + 0x36fc0000, 0x36fe0000, 0x37000000, 0x37010000, 0x37020000, 0x37030000, + 0x37040000, 0x37050000, 0x37060000, 0x37070000, 0x37080000, 0x37090000, + 0x370a0000, 0x370b0000, 0x370c0000, 0x370d0000, 0x370e0000, 0x370f0000, + 0x37100000, 0x37110000, 0x37120000, 0x37130000, 0x37140000, 0x37150000, + 0x37160000, 0x37170000, 0x37180000, 0x37190000, 0x371a0000, 0x371b0000, + 0x371c0000, 0x371d0000, 0x371e0000, 0x371f0000, 0x37200000, 0x37210000, + 0x37220000, 0x37230000, 0x37240000, 0x37250000, 0x37260000, 0x37270000, + 0x37280000, 0x37290000, 0x372a0000, 0x372b0000, 0x372c0000, 0x372d0000, + 0x372e0000, 0x372f0000, 0x37300000, 0x37310000, 0x37320000, 0x37330000, + 0x37340000, 0x37350000, 0x37360000, 0x37370000, 0x37380000, 0x37390000, + 0x373a0000, 0x373b0000, 0x373c0000, 0x373d0000, 0x373e0000, 0x373f0000, + 0x37400000, 0x37410000, 0x37420000, 0x37430000, 0x37440000, 0x37450000, + 0x37460000, 0x37470000, 0x37480000, 0x37490000, 0x374a0000, 0x374b0000, + 0x374c0000, 0x374d0000, 0x374e0000, 0x374f0000, 0x37500000, 0x37510000, + 0x37520000, 0x37530000, 0x37540000, 0x37550000, 0x37560000, 0x37570000, + 0x37580000, 0x37590000, 0x375a0000, 0x375b0000, 0x375c0000, 0x375d0000, + 0x375e0000, 0x375f0000, 0x37600000, 0x37610000, 0x37620000, 0x37630000, + 0x37640000, 0x37650000, 0x37660000, 0x37670000, 0x37680000, 0x37690000, + 0x376a0000, 0x376b0000, 0x376c0000, 0x376d0000, 0x376e0000, 0x376f0000, + 0x37700000, 0x37710000, 0x37720000, 0x37730000, 0x37740000, 0x37750000, + 0x37760000, 0x37770000, 0x37780000, 0x37790000, 0x377a0000, 0x377b0000, + 0x377c0000, 0x377d0000, 0x377e0000, 0x377f0000, 0x37800000, 0x37808000, + 0x37810000, 
0x37818000, 0x37820000, 0x37828000, 0x37830000, 0x37838000, + 0x37840000, 0x37848000, 0x37850000, 0x37858000, 0x37860000, 0x37868000, + 0x37870000, 0x37878000, 0x37880000, 0x37888000, 0x37890000, 0x37898000, + 0x378a0000, 0x378a8000, 0x378b0000, 0x378b8000, 0x378c0000, 0x378c8000, + 0x378d0000, 0x378d8000, 0x378e0000, 0x378e8000, 0x378f0000, 0x378f8000, + 0x37900000, 0x37908000, 0x37910000, 0x37918000, 0x37920000, 0x37928000, + 0x37930000, 0x37938000, 0x37940000, 0x37948000, 0x37950000, 0x37958000, + 0x37960000, 0x37968000, 0x37970000, 0x37978000, 0x37980000, 0x37988000, + 0x37990000, 0x37998000, 0x379a0000, 0x379a8000, 0x379b0000, 0x379b8000, + 0x379c0000, 0x379c8000, 0x379d0000, 0x379d8000, 0x379e0000, 0x379e8000, + 0x379f0000, 0x379f8000, 0x37a00000, 0x37a08000, 0x37a10000, 0x37a18000, + 0x37a20000, 0x37a28000, 0x37a30000, 0x37a38000, 0x37a40000, 0x37a48000, + 0x37a50000, 0x37a58000, 0x37a60000, 0x37a68000, 0x37a70000, 0x37a78000, + 0x37a80000, 0x37a88000, 0x37a90000, 0x37a98000, 0x37aa0000, 0x37aa8000, + 0x37ab0000, 0x37ab8000, 0x37ac0000, 0x37ac8000, 0x37ad0000, 0x37ad8000, + 0x37ae0000, 0x37ae8000, 0x37af0000, 0x37af8000, 0x37b00000, 0x37b08000, + 0x37b10000, 0x37b18000, 0x37b20000, 0x37b28000, 0x37b30000, 0x37b38000, + 0x37b40000, 0x37b48000, 0x37b50000, 0x37b58000, 0x37b60000, 0x37b68000, + 0x37b70000, 0x37b78000, 0x37b80000, 0x37b88000, 0x37b90000, 0x37b98000, + 0x37ba0000, 0x37ba8000, 0x37bb0000, 0x37bb8000, 0x37bc0000, 0x37bc8000, + 0x37bd0000, 0x37bd8000, 0x37be0000, 0x37be8000, 0x37bf0000, 0x37bf8000, + 0x37c00000, 0x37c08000, 0x37c10000, 0x37c18000, 0x37c20000, 0x37c28000, + 0x37c30000, 0x37c38000, 0x37c40000, 0x37c48000, 0x37c50000, 0x37c58000, + 0x37c60000, 0x37c68000, 0x37c70000, 0x37c78000, 0x37c80000, 0x37c88000, + 0x37c90000, 0x37c98000, 0x37ca0000, 0x37ca8000, 0x37cb0000, 0x37cb8000, + 0x37cc0000, 0x37cc8000, 0x37cd0000, 0x37cd8000, 0x37ce0000, 0x37ce8000, + 0x37cf0000, 0x37cf8000, 0x37d00000, 0x37d08000, 0x37d10000, 0x37d18000, + 0x37d20000, 
0x37d28000, 0x37d30000, 0x37d38000, 0x37d40000, 0x37d48000, + 0x37d50000, 0x37d58000, 0x37d60000, 0x37d68000, 0x37d70000, 0x37d78000, + 0x37d80000, 0x37d88000, 0x37d90000, 0x37d98000, 0x37da0000, 0x37da8000, + 0x37db0000, 0x37db8000, 0x37dc0000, 0x37dc8000, 0x37dd0000, 0x37dd8000, + 0x37de0000, 0x37de8000, 0x37df0000, 0x37df8000, 0x37e00000, 0x37e08000, + 0x37e10000, 0x37e18000, 0x37e20000, 0x37e28000, 0x37e30000, 0x37e38000, + 0x37e40000, 0x37e48000, 0x37e50000, 0x37e58000, 0x37e60000, 0x37e68000, + 0x37e70000, 0x37e78000, 0x37e80000, 0x37e88000, 0x37e90000, 0x37e98000, + 0x37ea0000, 0x37ea8000, 0x37eb0000, 0x37eb8000, 0x37ec0000, 0x37ec8000, + 0x37ed0000, 0x37ed8000, 0x37ee0000, 0x37ee8000, 0x37ef0000, 0x37ef8000, + 0x37f00000, 0x37f08000, 0x37f10000, 0x37f18000, 0x37f20000, 0x37f28000, + 0x37f30000, 0x37f38000, 0x37f40000, 0x37f48000, 0x37f50000, 0x37f58000, + 0x37f60000, 0x37f68000, 0x37f70000, 0x37f78000, 0x37f80000, 0x37f88000, + 0x37f90000, 0x37f98000, 0x37fa0000, 0x37fa8000, 0x37fb0000, 0x37fb8000, + 0x37fc0000, 0x37fc8000, 0x37fd0000, 0x37fd8000, 0x37fe0000, 0x37fe8000, + 0x37ff0000, 0x37ff8000, 0x38000000, 0x38004000, 0x38008000, 0x3800c000, + 0x38010000, 0x38014000, 0x38018000, 0x3801c000, 0x38020000, 0x38024000, + 0x38028000, 0x3802c000, 0x38030000, 0x38034000, 0x38038000, 0x3803c000, + 0x38040000, 0x38044000, 0x38048000, 0x3804c000, 0x38050000, 0x38054000, + 0x38058000, 0x3805c000, 0x38060000, 0x38064000, 0x38068000, 0x3806c000, + 0x38070000, 0x38074000, 0x38078000, 0x3807c000, 0x38080000, 0x38084000, + 0x38088000, 0x3808c000, 0x38090000, 0x38094000, 0x38098000, 0x3809c000, + 0x380a0000, 0x380a4000, 0x380a8000, 0x380ac000, 0x380b0000, 0x380b4000, + 0x380b8000, 0x380bc000, 0x380c0000, 0x380c4000, 0x380c8000, 0x380cc000, + 0x380d0000, 0x380d4000, 0x380d8000, 0x380dc000, 0x380e0000, 0x380e4000, + 0x380e8000, 0x380ec000, 0x380f0000, 0x380f4000, 0x380f8000, 0x380fc000, + 0x38100000, 0x38104000, 0x38108000, 0x3810c000, 0x38110000, 0x38114000, + 0x38118000, 
0x3811c000, 0x38120000, 0x38124000, 0x38128000, 0x3812c000, + 0x38130000, 0x38134000, 0x38138000, 0x3813c000, 0x38140000, 0x38144000, + 0x38148000, 0x3814c000, 0x38150000, 0x38154000, 0x38158000, 0x3815c000, + 0x38160000, 0x38164000, 0x38168000, 0x3816c000, 0x38170000, 0x38174000, + 0x38178000, 0x3817c000, 0x38180000, 0x38184000, 0x38188000, 0x3818c000, + 0x38190000, 0x38194000, 0x38198000, 0x3819c000, 0x381a0000, 0x381a4000, + 0x381a8000, 0x381ac000, 0x381b0000, 0x381b4000, 0x381b8000, 0x381bc000, + 0x381c0000, 0x381c4000, 0x381c8000, 0x381cc000, 0x381d0000, 0x381d4000, + 0x381d8000, 0x381dc000, 0x381e0000, 0x381e4000, 0x381e8000, 0x381ec000, + 0x381f0000, 0x381f4000, 0x381f8000, 0x381fc000, 0x38200000, 0x38204000, + 0x38208000, 0x3820c000, 0x38210000, 0x38214000, 0x38218000, 0x3821c000, + 0x38220000, 0x38224000, 0x38228000, 0x3822c000, 0x38230000, 0x38234000, + 0x38238000, 0x3823c000, 0x38240000, 0x38244000, 0x38248000, 0x3824c000, + 0x38250000, 0x38254000, 0x38258000, 0x3825c000, 0x38260000, 0x38264000, + 0x38268000, 0x3826c000, 0x38270000, 0x38274000, 0x38278000, 0x3827c000, + 0x38280000, 0x38284000, 0x38288000, 0x3828c000, 0x38290000, 0x38294000, + 0x38298000, 0x3829c000, 0x382a0000, 0x382a4000, 0x382a8000, 0x382ac000, + 0x382b0000, 0x382b4000, 0x382b8000, 0x382bc000, 0x382c0000, 0x382c4000, + 0x382c8000, 0x382cc000, 0x382d0000, 0x382d4000, 0x382d8000, 0x382dc000, + 0x382e0000, 0x382e4000, 0x382e8000, 0x382ec000, 0x382f0000, 0x382f4000, + 0x382f8000, 0x382fc000, 0x38300000, 0x38304000, 0x38308000, 0x3830c000, + 0x38310000, 0x38314000, 0x38318000, 0x3831c000, 0x38320000, 0x38324000, + 0x38328000, 0x3832c000, 0x38330000, 0x38334000, 0x38338000, 0x3833c000, + 0x38340000, 0x38344000, 0x38348000, 0x3834c000, 0x38350000, 0x38354000, + 0x38358000, 0x3835c000, 0x38360000, 0x38364000, 0x38368000, 0x3836c000, + 0x38370000, 0x38374000, 0x38378000, 0x3837c000, 0x38380000, 0x38384000, + 0x38388000, 0x3838c000, 0x38390000, 0x38394000, 0x38398000, 0x3839c000, + 0x383a0000, 
0x383a4000, 0x383a8000, 0x383ac000, 0x383b0000, 0x383b4000, + 0x383b8000, 0x383bc000, 0x383c0000, 0x383c4000, 0x383c8000, 0x383cc000, + 0x383d0000, 0x383d4000, 0x383d8000, 0x383dc000, 0x383e0000, 0x383e4000, + 0x383e8000, 0x383ec000, 0x383f0000, 0x383f4000, 0x383f8000, 0x383fc000, + 0x38400000, 0x38404000, 0x38408000, 0x3840c000, 0x38410000, 0x38414000, + 0x38418000, 0x3841c000, 0x38420000, 0x38424000, 0x38428000, 0x3842c000, + 0x38430000, 0x38434000, 0x38438000, 0x3843c000, 0x38440000, 0x38444000, + 0x38448000, 0x3844c000, 0x38450000, 0x38454000, 0x38458000, 0x3845c000, + 0x38460000, 0x38464000, 0x38468000, 0x3846c000, 0x38470000, 0x38474000, + 0x38478000, 0x3847c000, 0x38480000, 0x38484000, 0x38488000, 0x3848c000, + 0x38490000, 0x38494000, 0x38498000, 0x3849c000, 0x384a0000, 0x384a4000, + 0x384a8000, 0x384ac000, 0x384b0000, 0x384b4000, 0x384b8000, 0x384bc000, + 0x384c0000, 0x384c4000, 0x384c8000, 0x384cc000, 0x384d0000, 0x384d4000, + 0x384d8000, 0x384dc000, 0x384e0000, 0x384e4000, 0x384e8000, 0x384ec000, + 0x384f0000, 0x384f4000, 0x384f8000, 0x384fc000, 0x38500000, 0x38504000, + 0x38508000, 0x3850c000, 0x38510000, 0x38514000, 0x38518000, 0x3851c000, + 0x38520000, 0x38524000, 0x38528000, 0x3852c000, 0x38530000, 0x38534000, + 0x38538000, 0x3853c000, 0x38540000, 0x38544000, 0x38548000, 0x3854c000, + 0x38550000, 0x38554000, 0x38558000, 0x3855c000, 0x38560000, 0x38564000, + 0x38568000, 0x3856c000, 0x38570000, 0x38574000, 0x38578000, 0x3857c000, + 0x38580000, 0x38584000, 0x38588000, 0x3858c000, 0x38590000, 0x38594000, + 0x38598000, 0x3859c000, 0x385a0000, 0x385a4000, 0x385a8000, 0x385ac000, + 0x385b0000, 0x385b4000, 0x385b8000, 0x385bc000, 0x385c0000, 0x385c4000, + 0x385c8000, 0x385cc000, 0x385d0000, 0x385d4000, 0x385d8000, 0x385dc000, + 0x385e0000, 0x385e4000, 0x385e8000, 0x385ec000, 0x385f0000, 0x385f4000, + 0x385f8000, 0x385fc000, 0x38600000, 0x38604000, 0x38608000, 0x3860c000, + 0x38610000, 0x38614000, 0x38618000, 0x3861c000, 0x38620000, 0x38624000, + 0x38628000, 
0x3862c000, 0x38630000, 0x38634000, 0x38638000, 0x3863c000, + 0x38640000, 0x38644000, 0x38648000, 0x3864c000, 0x38650000, 0x38654000, + 0x38658000, 0x3865c000, 0x38660000, 0x38664000, 0x38668000, 0x3866c000, + 0x38670000, 0x38674000, 0x38678000, 0x3867c000, 0x38680000, 0x38684000, + 0x38688000, 0x3868c000, 0x38690000, 0x38694000, 0x38698000, 0x3869c000, + 0x386a0000, 0x386a4000, 0x386a8000, 0x386ac000, 0x386b0000, 0x386b4000, + 0x386b8000, 0x386bc000, 0x386c0000, 0x386c4000, 0x386c8000, 0x386cc000, + 0x386d0000, 0x386d4000, 0x386d8000, 0x386dc000, 0x386e0000, 0x386e4000, + 0x386e8000, 0x386ec000, 0x386f0000, 0x386f4000, 0x386f8000, 0x386fc000, + 0x38700000, 0x38704000, 0x38708000, 0x3870c000, 0x38710000, 0x38714000, + 0x38718000, 0x3871c000, 0x38720000, 0x38724000, 0x38728000, 0x3872c000, + 0x38730000, 0x38734000, 0x38738000, 0x3873c000, 0x38740000, 0x38744000, + 0x38748000, 0x3874c000, 0x38750000, 0x38754000, 0x38758000, 0x3875c000, + 0x38760000, 0x38764000, 0x38768000, 0x3876c000, 0x38770000, 0x38774000, + 0x38778000, 0x3877c000, 0x38780000, 0x38784000, 0x38788000, 0x3878c000, + 0x38790000, 0x38794000, 0x38798000, 0x3879c000, 0x387a0000, 0x387a4000, + 0x387a8000, 0x387ac000, 0x387b0000, 0x387b4000, 0x387b8000, 0x387bc000, + 0x387c0000, 0x387c4000, 0x387c8000, 0x387cc000, 0x387d0000, 0x387d4000, + 0x387d8000, 0x387dc000, 0x387e0000, 0x387e4000, 0x387e8000, 0x387ec000, + 0x387f0000, 0x387f4000, 0x387f8000, 0x387fc000, 0x38000000, 0x38002000, + 0x38004000, 0x38006000, 0x38008000, 0x3800a000, 0x3800c000, 0x3800e000, + 0x38010000, 0x38012000, 0x38014000, 0x38016000, 0x38018000, 0x3801a000, + 0x3801c000, 0x3801e000, 0x38020000, 0x38022000, 0x38024000, 0x38026000, + 0x38028000, 0x3802a000, 0x3802c000, 0x3802e000, 0x38030000, 0x38032000, + 0x38034000, 0x38036000, 0x38038000, 0x3803a000, 0x3803c000, 0x3803e000, + 0x38040000, 0x38042000, 0x38044000, 0x38046000, 0x38048000, 0x3804a000, + 0x3804c000, 0x3804e000, 0x38050000, 0x38052000, 0x38054000, 0x38056000, + 0x38058000, 
0x3805a000, 0x3805c000, 0x3805e000, 0x38060000, 0x38062000, + 0x38064000, 0x38066000, 0x38068000, 0x3806a000, 0x3806c000, 0x3806e000, + 0x38070000, 0x38072000, 0x38074000, 0x38076000, 0x38078000, 0x3807a000, + 0x3807c000, 0x3807e000, 0x38080000, 0x38082000, 0x38084000, 0x38086000, + 0x38088000, 0x3808a000, 0x3808c000, 0x3808e000, 0x38090000, 0x38092000, + 0x38094000, 0x38096000, 0x38098000, 0x3809a000, 0x3809c000, 0x3809e000, + 0x380a0000, 0x380a2000, 0x380a4000, 0x380a6000, 0x380a8000, 0x380aa000, + 0x380ac000, 0x380ae000, 0x380b0000, 0x380b2000, 0x380b4000, 0x380b6000, + 0x380b8000, 0x380ba000, 0x380bc000, 0x380be000, 0x380c0000, 0x380c2000, + 0x380c4000, 0x380c6000, 0x380c8000, 0x380ca000, 0x380cc000, 0x380ce000, + 0x380d0000, 0x380d2000, 0x380d4000, 0x380d6000, 0x380d8000, 0x380da000, + 0x380dc000, 0x380de000, 0x380e0000, 0x380e2000, 0x380e4000, 0x380e6000, + 0x380e8000, 0x380ea000, 0x380ec000, 0x380ee000, 0x380f0000, 0x380f2000, + 0x380f4000, 0x380f6000, 0x380f8000, 0x380fa000, 0x380fc000, 0x380fe000, + 0x38100000, 0x38102000, 0x38104000, 0x38106000, 0x38108000, 0x3810a000, + 0x3810c000, 0x3810e000, 0x38110000, 0x38112000, 0x38114000, 0x38116000, + 0x38118000, 0x3811a000, 0x3811c000, 0x3811e000, 0x38120000, 0x38122000, + 0x38124000, 0x38126000, 0x38128000, 0x3812a000, 0x3812c000, 0x3812e000, + 0x38130000, 0x38132000, 0x38134000, 0x38136000, 0x38138000, 0x3813a000, + 0x3813c000, 0x3813e000, 0x38140000, 0x38142000, 0x38144000, 0x38146000, + 0x38148000, 0x3814a000, 0x3814c000, 0x3814e000, 0x38150000, 0x38152000, + 0x38154000, 0x38156000, 0x38158000, 0x3815a000, 0x3815c000, 0x3815e000, + 0x38160000, 0x38162000, 0x38164000, 0x38166000, 0x38168000, 0x3816a000, + 0x3816c000, 0x3816e000, 0x38170000, 0x38172000, 0x38174000, 0x38176000, + 0x38178000, 0x3817a000, 0x3817c000, 0x3817e000, 0x38180000, 0x38182000, + 0x38184000, 0x38186000, 0x38188000, 0x3818a000, 0x3818c000, 0x3818e000, + 0x38190000, 0x38192000, 0x38194000, 0x38196000, 0x38198000, 0x3819a000, + 0x3819c000, 
0x3819e000, 0x381a0000, 0x381a2000, 0x381a4000, 0x381a6000, + 0x381a8000, 0x381aa000, 0x381ac000, 0x381ae000, 0x381b0000, 0x381b2000, + 0x381b4000, 0x381b6000, 0x381b8000, 0x381ba000, 0x381bc000, 0x381be000, + 0x381c0000, 0x381c2000, 0x381c4000, 0x381c6000, 0x381c8000, 0x381ca000, + 0x381cc000, 0x381ce000, 0x381d0000, 0x381d2000, 0x381d4000, 0x381d6000, + 0x381d8000, 0x381da000, 0x381dc000, 0x381de000, 0x381e0000, 0x381e2000, + 0x381e4000, 0x381e6000, 0x381e8000, 0x381ea000, 0x381ec000, 0x381ee000, + 0x381f0000, 0x381f2000, 0x381f4000, 0x381f6000, 0x381f8000, 0x381fa000, + 0x381fc000, 0x381fe000, 0x38200000, 0x38202000, 0x38204000, 0x38206000, + 0x38208000, 0x3820a000, 0x3820c000, 0x3820e000, 0x38210000, 0x38212000, + 0x38214000, 0x38216000, 0x38218000, 0x3821a000, 0x3821c000, 0x3821e000, + 0x38220000, 0x38222000, 0x38224000, 0x38226000, 0x38228000, 0x3822a000, + 0x3822c000, 0x3822e000, 0x38230000, 0x38232000, 0x38234000, 0x38236000, + 0x38238000, 0x3823a000, 0x3823c000, 0x3823e000, 0x38240000, 0x38242000, + 0x38244000, 0x38246000, 0x38248000, 0x3824a000, 0x3824c000, 0x3824e000, + 0x38250000, 0x38252000, 0x38254000, 0x38256000, 0x38258000, 0x3825a000, + 0x3825c000, 0x3825e000, 0x38260000, 0x38262000, 0x38264000, 0x38266000, + 0x38268000, 0x3826a000, 0x3826c000, 0x3826e000, 0x38270000, 0x38272000, + 0x38274000, 0x38276000, 0x38278000, 0x3827a000, 0x3827c000, 0x3827e000, + 0x38280000, 0x38282000, 0x38284000, 0x38286000, 0x38288000, 0x3828a000, + 0x3828c000, 0x3828e000, 0x38290000, 0x38292000, 0x38294000, 0x38296000, + 0x38298000, 0x3829a000, 0x3829c000, 0x3829e000, 0x382a0000, 0x382a2000, + 0x382a4000, 0x382a6000, 0x382a8000, 0x382aa000, 0x382ac000, 0x382ae000, + 0x382b0000, 0x382b2000, 0x382b4000, 0x382b6000, 0x382b8000, 0x382ba000, + 0x382bc000, 0x382be000, 0x382c0000, 0x382c2000, 0x382c4000, 0x382c6000, + 0x382c8000, 0x382ca000, 0x382cc000, 0x382ce000, 0x382d0000, 0x382d2000, + 0x382d4000, 0x382d6000, 0x382d8000, 0x382da000, 0x382dc000, 0x382de000, + 0x382e0000, 
0x382e2000, 0x382e4000, 0x382e6000, 0x382e8000, 0x382ea000, + 0x382ec000, 0x382ee000, 0x382f0000, 0x382f2000, 0x382f4000, 0x382f6000, + 0x382f8000, 0x382fa000, 0x382fc000, 0x382fe000, 0x38300000, 0x38302000, + 0x38304000, 0x38306000, 0x38308000, 0x3830a000, 0x3830c000, 0x3830e000, + 0x38310000, 0x38312000, 0x38314000, 0x38316000, 0x38318000, 0x3831a000, + 0x3831c000, 0x3831e000, 0x38320000, 0x38322000, 0x38324000, 0x38326000, + 0x38328000, 0x3832a000, 0x3832c000, 0x3832e000, 0x38330000, 0x38332000, + 0x38334000, 0x38336000, 0x38338000, 0x3833a000, 0x3833c000, 0x3833e000, + 0x38340000, 0x38342000, 0x38344000, 0x38346000, 0x38348000, 0x3834a000, + 0x3834c000, 0x3834e000, 0x38350000, 0x38352000, 0x38354000, 0x38356000, + 0x38358000, 0x3835a000, 0x3835c000, 0x3835e000, 0x38360000, 0x38362000, + 0x38364000, 0x38366000, 0x38368000, 0x3836a000, 0x3836c000, 0x3836e000, + 0x38370000, 0x38372000, 0x38374000, 0x38376000, 0x38378000, 0x3837a000, + 0x3837c000, 0x3837e000, 0x38380000, 0x38382000, 0x38384000, 0x38386000, + 0x38388000, 0x3838a000, 0x3838c000, 0x3838e000, 0x38390000, 0x38392000, + 0x38394000, 0x38396000, 0x38398000, 0x3839a000, 0x3839c000, 0x3839e000, + 0x383a0000, 0x383a2000, 0x383a4000, 0x383a6000, 0x383a8000, 0x383aa000, + 0x383ac000, 0x383ae000, 0x383b0000, 0x383b2000, 0x383b4000, 0x383b6000, + 0x383b8000, 0x383ba000, 0x383bc000, 0x383be000, 0x383c0000, 0x383c2000, + 0x383c4000, 0x383c6000, 0x383c8000, 0x383ca000, 0x383cc000, 0x383ce000, + 0x383d0000, 0x383d2000, 0x383d4000, 0x383d6000, 0x383d8000, 0x383da000, + 0x383dc000, 0x383de000, 0x383e0000, 0x383e2000, 0x383e4000, 0x383e6000, + 0x383e8000, 0x383ea000, 0x383ec000, 0x383ee000, 0x383f0000, 0x383f2000, + 0x383f4000, 0x383f6000, 0x383f8000, 0x383fa000, 0x383fc000, 0x383fe000, + 0x38400000, 0x38402000, 0x38404000, 0x38406000, 0x38408000, 0x3840a000, + 0x3840c000, 0x3840e000, 0x38410000, 0x38412000, 0x38414000, 0x38416000, + 0x38418000, 0x3841a000, 0x3841c000, 0x3841e000, 0x38420000, 0x38422000, + 0x38424000, 
0x38426000, 0x38428000, 0x3842a000, 0x3842c000, 0x3842e000, + 0x38430000, 0x38432000, 0x38434000, 0x38436000, 0x38438000, 0x3843a000, + 0x3843c000, 0x3843e000, 0x38440000, 0x38442000, 0x38444000, 0x38446000, + 0x38448000, 0x3844a000, 0x3844c000, 0x3844e000, 0x38450000, 0x38452000, + 0x38454000, 0x38456000, 0x38458000, 0x3845a000, 0x3845c000, 0x3845e000, + 0x38460000, 0x38462000, 0x38464000, 0x38466000, 0x38468000, 0x3846a000, + 0x3846c000, 0x3846e000, 0x38470000, 0x38472000, 0x38474000, 0x38476000, + 0x38478000, 0x3847a000, 0x3847c000, 0x3847e000, 0x38480000, 0x38482000, + 0x38484000, 0x38486000, 0x38488000, 0x3848a000, 0x3848c000, 0x3848e000, + 0x38490000, 0x38492000, 0x38494000, 0x38496000, 0x38498000, 0x3849a000, + 0x3849c000, 0x3849e000, 0x384a0000, 0x384a2000, 0x384a4000, 0x384a6000, + 0x384a8000, 0x384aa000, 0x384ac000, 0x384ae000, 0x384b0000, 0x384b2000, + 0x384b4000, 0x384b6000, 0x384b8000, 0x384ba000, 0x384bc000, 0x384be000, + 0x384c0000, 0x384c2000, 0x384c4000, 0x384c6000, 0x384c8000, 0x384ca000, + 0x384cc000, 0x384ce000, 0x384d0000, 0x384d2000, 0x384d4000, 0x384d6000, + 0x384d8000, 0x384da000, 0x384dc000, 0x384de000, 0x384e0000, 0x384e2000, + 0x384e4000, 0x384e6000, 0x384e8000, 0x384ea000, 0x384ec000, 0x384ee000, + 0x384f0000, 0x384f2000, 0x384f4000, 0x384f6000, 0x384f8000, 0x384fa000, + 0x384fc000, 0x384fe000, 0x38500000, 0x38502000, 0x38504000, 0x38506000, + 0x38508000, 0x3850a000, 0x3850c000, 0x3850e000, 0x38510000, 0x38512000, + 0x38514000, 0x38516000, 0x38518000, 0x3851a000, 0x3851c000, 0x3851e000, + 0x38520000, 0x38522000, 0x38524000, 0x38526000, 0x38528000, 0x3852a000, + 0x3852c000, 0x3852e000, 0x38530000, 0x38532000, 0x38534000, 0x38536000, + 0x38538000, 0x3853a000, 0x3853c000, 0x3853e000, 0x38540000, 0x38542000, + 0x38544000, 0x38546000, 0x38548000, 0x3854a000, 0x3854c000, 0x3854e000, + 0x38550000, 0x38552000, 0x38554000, 0x38556000, 0x38558000, 0x3855a000, + 0x3855c000, 0x3855e000, 0x38560000, 0x38562000, 0x38564000, 0x38566000, + 0x38568000, 
0x3856a000, 0x3856c000, 0x3856e000, 0x38570000, 0x38572000, + 0x38574000, 0x38576000, 0x38578000, 0x3857a000, 0x3857c000, 0x3857e000, + 0x38580000, 0x38582000, 0x38584000, 0x38586000, 0x38588000, 0x3858a000, + 0x3858c000, 0x3858e000, 0x38590000, 0x38592000, 0x38594000, 0x38596000, + 0x38598000, 0x3859a000, 0x3859c000, 0x3859e000, 0x385a0000, 0x385a2000, + 0x385a4000, 0x385a6000, 0x385a8000, 0x385aa000, 0x385ac000, 0x385ae000, + 0x385b0000, 0x385b2000, 0x385b4000, 0x385b6000, 0x385b8000, 0x385ba000, + 0x385bc000, 0x385be000, 0x385c0000, 0x385c2000, 0x385c4000, 0x385c6000, + 0x385c8000, 0x385ca000, 0x385cc000, 0x385ce000, 0x385d0000, 0x385d2000, + 0x385d4000, 0x385d6000, 0x385d8000, 0x385da000, 0x385dc000, 0x385de000, + 0x385e0000, 0x385e2000, 0x385e4000, 0x385e6000, 0x385e8000, 0x385ea000, + 0x385ec000, 0x385ee000, 0x385f0000, 0x385f2000, 0x385f4000, 0x385f6000, + 0x385f8000, 0x385fa000, 0x385fc000, 0x385fe000, 0x38600000, 0x38602000, + 0x38604000, 0x38606000, 0x38608000, 0x3860a000, 0x3860c000, 0x3860e000, + 0x38610000, 0x38612000, 0x38614000, 0x38616000, 0x38618000, 0x3861a000, + 0x3861c000, 0x3861e000, 0x38620000, 0x38622000, 0x38624000, 0x38626000, + 0x38628000, 0x3862a000, 0x3862c000, 0x3862e000, 0x38630000, 0x38632000, + 0x38634000, 0x38636000, 0x38638000, 0x3863a000, 0x3863c000, 0x3863e000, + 0x38640000, 0x38642000, 0x38644000, 0x38646000, 0x38648000, 0x3864a000, + 0x3864c000, 0x3864e000, 0x38650000, 0x38652000, 0x38654000, 0x38656000, + 0x38658000, 0x3865a000, 0x3865c000, 0x3865e000, 0x38660000, 0x38662000, + 0x38664000, 0x38666000, 0x38668000, 0x3866a000, 0x3866c000, 0x3866e000, + 0x38670000, 0x38672000, 0x38674000, 0x38676000, 0x38678000, 0x3867a000, + 0x3867c000, 0x3867e000, 0x38680000, 0x38682000, 0x38684000, 0x38686000, + 0x38688000, 0x3868a000, 0x3868c000, 0x3868e000, 0x38690000, 0x38692000, + 0x38694000, 0x38696000, 0x38698000, 0x3869a000, 0x3869c000, 0x3869e000, + 0x386a0000, 0x386a2000, 0x386a4000, 0x386a6000, 0x386a8000, 0x386aa000, + 0x386ac000, 
0x386ae000, 0x386b0000, 0x386b2000, 0x386b4000, 0x386b6000, + 0x386b8000, 0x386ba000, 0x386bc000, 0x386be000, 0x386c0000, 0x386c2000, + 0x386c4000, 0x386c6000, 0x386c8000, 0x386ca000, 0x386cc000, 0x386ce000, + 0x386d0000, 0x386d2000, 0x386d4000, 0x386d6000, 0x386d8000, 0x386da000, + 0x386dc000, 0x386de000, 0x386e0000, 0x386e2000, 0x386e4000, 0x386e6000, + 0x386e8000, 0x386ea000, 0x386ec000, 0x386ee000, 0x386f0000, 0x386f2000, + 0x386f4000, 0x386f6000, 0x386f8000, 0x386fa000, 0x386fc000, 0x386fe000, + 0x38700000, 0x38702000, 0x38704000, 0x38706000, 0x38708000, 0x3870a000, + 0x3870c000, 0x3870e000, 0x38710000, 0x38712000, 0x38714000, 0x38716000, + 0x38718000, 0x3871a000, 0x3871c000, 0x3871e000, 0x38720000, 0x38722000, + 0x38724000, 0x38726000, 0x38728000, 0x3872a000, 0x3872c000, 0x3872e000, + 0x38730000, 0x38732000, 0x38734000, 0x38736000, 0x38738000, 0x3873a000, + 0x3873c000, 0x3873e000, 0x38740000, 0x38742000, 0x38744000, 0x38746000, + 0x38748000, 0x3874a000, 0x3874c000, 0x3874e000, 0x38750000, 0x38752000, + 0x38754000, 0x38756000, 0x38758000, 0x3875a000, 0x3875c000, 0x3875e000, + 0x38760000, 0x38762000, 0x38764000, 0x38766000, 0x38768000, 0x3876a000, + 0x3876c000, 0x3876e000, 0x38770000, 0x38772000, 0x38774000, 0x38776000, + 0x38778000, 0x3877a000, 0x3877c000, 0x3877e000, 0x38780000, 0x38782000, + 0x38784000, 0x38786000, 0x38788000, 0x3878a000, 0x3878c000, 0x3878e000, + 0x38790000, 0x38792000, 0x38794000, 0x38796000, 0x38798000, 0x3879a000, + 0x3879c000, 0x3879e000, 0x387a0000, 0x387a2000, 0x387a4000, 0x387a6000, + 0x387a8000, 0x387aa000, 0x387ac000, 0x387ae000, 0x387b0000, 0x387b2000, + 0x387b4000, 0x387b6000, 0x387b8000, 0x387ba000, 0x387bc000, 0x387be000, + 0x387c0000, 0x387c2000, 0x387c4000, 0x387c6000, 0x387c8000, 0x387ca000, + 0x387cc000, 0x387ce000, 0x387d0000, 0x387d2000, 0x387d4000, 0x387d6000, + 0x387d8000, 0x387da000, 0x387dc000, 0x387de000, 0x387e0000, 0x387e2000, + 0x387e4000, 0x387e6000, 0x387e8000, 0x387ea000, 0x387ec000, 0x387ee000, + 0x387f0000, 
0x387f2000, 0x387f4000, 0x387f6000, 0x387f8000, 0x387fa000, + 0x387fc000, 0x387fe000}; + +static const uint16_t offsettable[64] = { + 0x0000, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, + 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, + 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, + 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, + 0x0000, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, + 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, + 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, + 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400}; + +static const uint32_t exponenttable[64] = { + 0x00000000, 0x00800000, 0x01000000, 0x01800000, 0x02000000, 0x02800000, + 0x03000000, 0x03800000, 0x04000000, 0x04800000, 0x05000000, 0x05800000, + 0x06000000, 0x06800000, 0x07000000, 0x07800000, 0x08000000, 0x08800000, + 0x09000000, 0x09800000, 0x0a000000, 0x0a800000, 0x0b000000, 0x0b800000, + 0x0c000000, 0x0c800000, 0x0d000000, 0x0d800000, 0x0e000000, 0x0e800000, + 0x0f000000, 0x47800000, 0x80000000, 0x80800000, 0x81000000, 0x81800000, + 0x82000000, 0x82800000, 0x83000000, 0x83800000, 0x84000000, 0x84800000, + 0x85000000, 0x85800000, 0x86000000, 0x86800000, 0x87000000, 0x87800000, + 0x88000000, 0x88800000, 0x89000000, 0x89800000, 0x8a000000, 0x8a800000, + 0x8b000000, 0x8b800000, 0x8c000000, 0x8c800000, 0x8d000000, 0x8d800000, + 0x8e000000, 0x8e800000, 0x8f000000, 0xc7800000}; + +static const uint16_t basetable[512] = { + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, + 0x0020, 0x0040, 0x0080, 0x0100, 0x0200, 0x0400, 0x0800, 0x0c00, 0x1000, + 0x1400, 0x1800, 0x1c00, 0x2000, 0x2400, 0x2800, 0x2c00, 0x3000, 0x3400, + 0x3800, 0x3c00, 0x4000, 0x4400, 0x4800, 0x4c00, 0x5000, 0x5400, 0x5800, + 0x5c00, 0x6000, 0x6400, 0x6800, 0x6c00, 0x7000, 0x7400, 0x7800, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, + 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 
0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8001, + 0x8002, 0x8004, 0x8008, 0x8010, 0x8020, 0x8040, 0x8080, 0x8100, 0x8200, + 0x8400, 0x8800, 0x8c00, 0x9000, 0x9400, 0x9800, 0x9c00, 0xa000, 0xa400, + 0xa800, 0xac00, 0xb000, 0xb400, 0xb800, 0xbc00, 0xc000, 0xc400, 0xc800, + 0xcc00, 0xd000, 0xd400, 0xd800, 0xdc00, 0xe000, 0xe400, 0xe800, 0xec00, + 0xf000, 0xf400, 0xf800, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, + 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00}; + +static const uint8_t shifttable[512] = { + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 
0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, + 0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, + 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, + 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x0d, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 
0x18, 0x18, 0x18, 0x18, 0x17, + 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0d, 0x0d, + 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, + 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, + 0x0d, 0x0d, 0x0d, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x0d}; + +half_t Float2Half(float f) { + uint32_t v = *reinterpret_cast(&f); + return basetable[(v >> 23) & 0x1ff] + + ((v & 0x007fffff) >> shifttable[(v >> 23) & 0x1ff]); +} + +float Half2Float(half_t h) { + uint32_t v = mantissatable[offsettable[h >> 10] + (h & 0x3ff)] + + exponenttable[h >> 10]; + return *reinterpret_cast(&v); +} + +void FloatArray2HalfArray(float *f_array, half_t *h_array, int count) { + for (int i = 0; i < count; ++i) { + h_array[i] = Float2Half(f_array[i]); + } +} + +void HalfArray2FloatArray(half_t *h_array, float *f_array, int count) { + for (int i = 0; i < count; ++i) { + f_array[i] = Half2Float(h_array[i]); + } +} + +} // namespace lite +} // namespace paddle diff --git a/lite/backends/opencl/cl_kernel/image/relu_kernel.cl b/lite/backends/opencl/cl_half.h similarity index 52% rename from lite/backends/opencl/cl_kernel/image/relu_kernel.cl rename to lite/backends/opencl/cl_half.h index 43a27067c2f2c418d314f9bce95bccbbb51a9be0..0dcf325db2bc13b8fff68f1e777d4680d937abce 100644 --- 
a/lite/backends/opencl/cl_kernel/image/relu_kernel.cl +++ b/lite/backends/opencl/cl_half.h @@ -12,19 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include +#pragma once +#include -__kernel void relu(__read_only image2d_t input, - __write_only image2d_t output) { +namespace paddle { +namespace lite { - const int x = get_global_id(0); // image_width - const int y = get_global_id(1); // image_height +typedef uint16_t half_t; - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; +half_t Float2Half(float f); - CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); - in = max((CL_DTYPE4)(0.0f), in); - WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in); -} +float Half2Float(half_t h); + +void FloatArray2HalfArray(float *f_array, half_t *h_array, int count); + +void HalfArray2FloatArray(half_t *h_array, float *f_array, int count); + +} // namespace lite +} // namespace paddle diff --git a/lite/backends/opencl/cl_image.cc b/lite/backends/opencl/cl_image.cc index b67f4040bff4cac15624c1440ca741d2b9dfa6ba..1e21b3d03a4a231f4bb171e83f4038e7922fe19a 100644 --- a/lite/backends/opencl/cl_image.cc +++ b/lite/backends/opencl/cl_image.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "lite/backends/opencl/cl_image.h" +#include +#include "lite/backends/opencl/cl_half.h" #include "lite/backends/opencl/cl_runtime.h" #include "lite/backends/opencl/cl_utility.h" #include "lite/utils/cp_logging.h" @@ -24,7 +26,7 @@ std::ostream& operator<<(std::ostream& os, const CLImage& cl_image) { int width = cl_image.image_dims_[0]; int height = cl_image.image_dims_[1]; - float* image_data = new float[height * width * 4]; + uint16_t* image_data = new uint16_t[height * width * 4]; cl::Image* image = cl_image.cl_image(); cl::array origin = {0, 0, 0}; @@ -41,7 +43,7 @@ std::ostream& operator<<(std::ostream& os, const CLImage& cl_image) { int stride = cl_image.numel() / 20; stride = stride > 0 ? stride : 1; - os << " dims: " << cl_image.tensor_dims_ << "\n"; + os << " dims: "; // << cl_image.tensor_dims_ << "\n"; for (int i = 0; i < cl_image.numel(); i += stride) { os << tensor_data[i] << " "; } @@ -123,7 +125,7 @@ void CLImage::InitCLImage(const cl::Context& context, VLOG(3) << " begin init cl image "; image_dims_ = converter->InitImageDimInfoWith(tensor_dims_); - float* image_data = new float[image_dims_.production() * 4]; + uint16_t* image_data = new uint16_t[image_dims_.production() * 4]; VLOG(3) << " convert to image "; converter->NCHWToImage(tensor_data_.get(), image_data, tensor_dims_); diff --git a/lite/backends/opencl/cl_image_converter.cc b/lite/backends/opencl/cl_image_converter.cc index 402f710d7a226de089134b4abc41dc41027e0da1..7e6f83a4d12f82c780b8e2a8ba582d6a13d8dc07 100644 --- a/lite/backends/opencl/cl_image_converter.cc +++ b/lite/backends/opencl/cl_image_converter.cc @@ -37,7 +37,7 @@ DDim CLImageConverterDefault::InitImageDimInfoWith(const DDim &tensor_dim) { } void CLImageConverterDefault::NCHWToImage(float *nchw, - float *image, + half_t *image, const DDim &tensor_dim) { size_t new_dims[] = {1, 1, 1, 1}; for (size_t j = 0; j < tensor_dim.size(); ++j) { @@ -69,7 +69,7 @@ void CLImageConverterDefault::NCHWToImage(float *nchw, if (c < C) { 
// size_t x = (n * width * H + h * width + (c / 4) * W + w) * 4 + // (c % 4); - image[i2] = *p; + image[i2] = Float2Half(*p); i2 += 4; p++; } else { @@ -84,7 +84,7 @@ void CLImageConverterDefault::NCHWToImage(float *nchw, } } -void CLImageConverterDefault::ImageToNCHW(float *image, +void CLImageConverterDefault::ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim, const DDim &tensor_dim) { @@ -109,7 +109,7 @@ void CLImageConverterDefault::ImageToNCHW(float *image, for (size_t h = 0; h < H; h++) { size_t i2 = (i1 << 2) + c % 4; for (size_t w = 0; w < W; w++) { - *p = image[i2]; + *p = Half2Float(image[i2]); i2 += 4; p++; } @@ -164,7 +164,7 @@ DDim CLImageConverterFolder::InitImageDimInfoWith(const DDim &tensor_dim) { } void CLImageConverterFolder::NCHWToImage(float *tensor, - float *image, + half_t *image, const DDim &tensor_dim) { CHECK(tensor_dim.size() <= 4 && tensor_dim.size() > 0) << " Tensor dim is not support!"; @@ -187,13 +187,14 @@ void CLImageConverterFolder::NCHWToImage(float *tensor, for (size_t h = 0; h < tdim[0]; h++) { for (size_t w = 0; w < tdim[1]; w++) { - image[(h * width + w / 4) * 4 + (w % 4)] = tensor[h * tdim[1] + w]; + image[(h * width + w / 4) * 4 + (w % 4)] = + Float2Half(tensor[h * tdim[1] + w]); } } } } -void CLImageConverterFolder::ImageToNCHW(float *image, +void CLImageConverterFolder::ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim, const DDim &tensor_dim) { @@ -216,7 +217,7 @@ void CLImageConverterFolder::ImageToNCHW(float *image, for (size_t h = 0; h < H; h++) { for (size_t w = 0; w < W; w++) { - p[h * W + w] = image[(h * width + w / 4) * 4 + (w % 4)]; + p[h * W + w] = Half2Float(image[(h * width + w / 4) * 4 + (w % 4)]); } } } @@ -237,7 +238,7 @@ DDim CLImageConverterNWBlock::InitImageDimInfoWith(const DDim &tensor_dim) { } void CLImageConverterNWBlock::NCHWToImage(float *tensor, - float *image, + half_t *image, const DDim &tensor_dim) { CHECK(tensor_dim.size() == 4) << " Tensor dim is not 4."; auto 
image_dim = InitImageDimInfoWith(tensor_dim); @@ -257,7 +258,7 @@ void CLImageConverterNWBlock::NCHWToImage(float *tensor, size_t index = 4 * c * (width * H) + 4 * h * width + 4 * W * (n / 4) + w * 4 + n % 4; if (n < N) { - image[index] = *p; + image[index] = Float2Half(*p); p++; } else { image[index] = 0.0; @@ -272,7 +273,7 @@ void CLImageConverterNWBlock::NCHWToImage(float *tensor, VLOG(3) << " init done"; } -void CLImageConverterNWBlock::ImageToNCHW(float *image, +void CLImageConverterNWBlock::ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim, const DDim &tensor_dim) { @@ -291,7 +292,7 @@ void CLImageConverterNWBlock::ImageToNCHW(float *image, for (size_t w = 0; w < W; ++w) { size_t index = 4 * c * (width * H) + 4 * h * width + 4 * W * (n / 4) + w * 4 + n % 4; - *p = image[index]; + *p = Half2Float(image[index]); p++; if (index >= (width * height * 4)) { LOG(INFO) << " index out of range "; @@ -318,7 +319,7 @@ DDim CLImageConverterDWBlock::InitImageDimInfoWith(const DDim &tensor_dim) { } void CLImageConverterDWBlock::NCHWToImage(float *tensor, - float *image, + half_t *image, const DDim &tensor_dim) { size_t new_dims[] = {1, 1, 1, 1}; for (size_t j = 0; j < tensor_dim.size(); ++j) { @@ -350,7 +351,7 @@ void CLImageConverterDWBlock::NCHWToImage(float *tensor, if (c < C) { // size_t x = (n * width * H + h * width + (c / 4) * W + w) * 4 + // (c % 4); - image[i2] = *p; + image[i2] = Float2Half(*p); i2 += 4; p++; } else { @@ -365,7 +366,7 @@ void CLImageConverterDWBlock::NCHWToImage(float *tensor, } } -void CLImageConverterDWBlock::ImageToNCHW(float *image, +void CLImageConverterDWBlock::ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim, const DDim &tensor_dim) { @@ -384,7 +385,7 @@ void CLImageConverterDWBlock::ImageToNCHW(float *image, for (size_t h = 0; h < H; h++) { size_t i2 = (i1 << 2) + c % 4; for (size_t w = 0; w < W; w++) { - *p = image[i2]; + *p = Half2Float(image[i2]); i2 += 4; p++; } @@ -418,7 +419,7 @@ DDim 
CLImageConverterNormal::InitImageDimInfoWith(const DDim &tensor_dim) { } void CLImageConverterNormal::NCHWToImage(float *tensor, - float *image, + half_t *image, const DDim &tensor_dim) { CHECK(tensor_dim.size() <= 4 && tensor_dim.size() > 0) << " Tensor dim is not support!"; @@ -427,7 +428,7 @@ void CLImageConverterNormal::NCHWToImage(float *tensor, default_converter.NCHWToImage(tensor, image, tensor_dim); } -void CLImageConverterNormal::ImageToNCHW(float *image, +void CLImageConverterNormal::ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim, const DDim &tensor_dim) { @@ -449,10 +450,10 @@ DDim CLImageConverterWinoTransWeight::InitImageDimInfoWith( } void CLImageConverterWinoTransWeight::NCHWToImage(float *tensor, - float *image, + half_t *image, const DDim &tensor_dim) {} -void CLImageConverterWinoTransWeight::ImageToNCHW(float *image, +void CLImageConverterWinoTransWeight::ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim, const DDim &tensor_dim) {} diff --git a/lite/backends/opencl/cl_image_converter.h b/lite/backends/opencl/cl_image_converter.h index 962eb8d3ef35bdb603aa4a56181b1124885d5506..bb8602f6adae377f21c8fe92448e8feae64a773f 100644 --- a/lite/backends/opencl/cl_image_converter.h +++ b/lite/backends/opencl/cl_image_converter.h @@ -14,6 +14,7 @@ limitations under the License. 
*/ #pragma once +#include "lite/backends/opencl/cl_half.h" #include "lite/core/tensor.h" namespace paddle { @@ -24,10 +25,10 @@ class CLImageConverterBase { virtual ~CLImageConverterBase() {} virtual void NCHWToImage(float *nchw, - float *image, + half_t *image, const DDim &tensor_dim) = 0; - virtual void ImageToNCHW(float *image, + virtual void ImageToNCHW(half_t *image, float *nchw, const DDim &image_dim, const DDim &tensor_dim) = 0; @@ -37,8 +38,8 @@ class CLImageConverterBase { class CLImageConverterDefault : public CLImageConverterBase { public: DDim InitImageDimInfoWith(const DDim &tensor_dim) override; - void NCHWToImage(float *nchw, float *image, const DDim &tensor_dim) override; - void ImageToNCHW(float *image, + void NCHWToImage(float *nchw, half_t *image, const DDim &tensor_dim) override; + void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim, const DDim &tensor_dim) override; @@ -48,9 +49,9 @@ class CLImageConverterFolder : public CLImageConverterBase { public: DDim InitImageDimInfoWith(const DDim &tensor_dim) override; void NCHWToImage(float *tensor, - float *image, + half_t *image, const DDim &tensor_dim) override; - void ImageToNCHW(float *image, + void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim, const DDim &tensor_dim) override; @@ -77,9 +78,9 @@ class CLImageConverterNormal : public CLImageConverterBase { public: DDim InitImageDimInfoWith(const DDim &tensor_dim) override; void NCHWToImage(float *tensor, - float *image, + half_t *image, const DDim &tensor_dim) override; - void ImageToNCHW(float *image, + void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim, const DDim &tensor_dim) override; @@ -106,9 +107,9 @@ class CLImageConverterNWBlock : public CLImageConverterBase { public: DDim InitImageDimInfoWith(const DDim &tensor_dim) override; void NCHWToImage(float *tensor, - float *image, + half_t *image, const DDim &tensor_dim) override; - void ImageToNCHW(float *image, + void ImageToNCHW(half_t 
*image, float *tensor, const DDim &image_dim, const DDim &tensor_dim) override; @@ -117,9 +118,9 @@ class CLImageConverterDWBlock : public CLImageConverterBase { public: DDim InitImageDimInfoWith(const DDim &tensor_dim) override; void NCHWToImage(float *tensor, - float *image, + half_t *image, const DDim &tensor_dim) override; - void ImageToNCHW(float *image, + void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim, const DDim &tensor_dim) override; @@ -129,9 +130,9 @@ class CLImageConverterWinoTransWeight : public CLImageConverterBase { public: DDim InitImageDimInfoWith(const DDim &tensor_dim) override; void NCHWToImage(float *tensor, - float *image, + half_t *image, const DDim &tensor_dim) override; - void ImageToNCHW(float *image, + void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim, const DDim &tensor_dim) override; diff --git a/lite/backends/opencl/cl_kernel/buffer/fc_kernel.cl b/lite/backends/opencl/cl_kernel/buffer/fc_kernel.cl index b8dbf62c06f85ef6237378d8ceab37f8fa2cd69f..a14748c69f3eafce515c90f2b8a226703fe5883d 100644 --- a/lite/backends/opencl/cl_kernel/buffer/fc_kernel.cl +++ b/lite/backends/opencl/cl_kernel/buffer/fc_kernel.cl @@ -91,11 +91,7 @@ void gemm_batch_naive(__global const CL_DTYPE* a, c0 += a0 * b0; } -#ifdef RELU cur_c[row * N + col] = activation(c0); -#else - cur_c[row * N + col] = c0; -#endif } @@ -103,7 +99,7 @@ void gemm_batch_naive(__global const CL_DTYPE* a, // a: filter_d // b: x_d // c: output_d - +#if 0 // TODO(ysh239): cause CL_OUT_OF_HOST_MEMORY on some devices(such as snapdragon 855) //#define PRINT_KERNEL __kernel void gemm_batch(__global const CL_DTYPE* Aptr, @@ -213,7 +209,7 @@ void gemm_batch(__global const CL_DTYPE* Aptr, } } } - +#endif // fc_gemv_naive: keep for check // used for fc with M = 1 @@ -259,7 +255,7 @@ void fc_gemv_1x4(__global const CL_DTYPE* a, const int col = get_global_id(0) << 2; // gws[0]: [0, N >> 2) height of B == N if (col + 3 < N) { - CL_DTYPE4 c0 = 0.0f; + half4 c0 = 
0.0f; if (bias) { c0.x = bias[col]; c0.y = bias[col+1]; @@ -270,11 +266,12 @@ void fc_gemv_1x4(__global const CL_DTYPE* a, // main loop of K int p = 0; for (; p < K - 3; p += 4) { - CL_DTYPE4 a0 = vload4(0, a + p); - CL_DTYPE4 b0 = vload4(0, b + p * N + col); - CL_DTYPE4 b1 = vload4(0, b + (p+1) * N + col); - CL_DTYPE4 b2 = vload4(0, b + (p+2) * N + col); - CL_DTYPE4 b3 = vload4(0, b + (p+3) * N + col); + half4 a0 = convert_half4(vload4(0, a + p)); + + half4 b0 = convert_half4(vload4(0, b + p * N + col)); + half4 b1 = convert_half4(vload4(0, b + (p+1) * N + col)); + half4 b2 = convert_half4(vload4(0, b + (p+2) * N + col)); + half4 b3 = convert_half4(vload4(0, b + (p+3) * N + col)); c0 += a0.x * b0; c0 += a0.y * b1; @@ -283,21 +280,21 @@ void fc_gemv_1x4(__global const CL_DTYPE* a, } // compute left K - CL_DTYPE4 b2 = 0.0f, - b1 = 0.0f, - b0 = 0.0f, - a0 = 0.0f; + half4 b2 = 0.0f, + b1 = 0.0f, + b0 = 0.0f, + a0 = 0.0f; switch (K - p) { case 3: { - b2 = vload4(0, b + (p+2) * N + col); + b2 = convert_half4(vload4(0, b + (p+2) * N + col)); a0.z = a[p + 2]; } case 2: { - b1 = vload4(0, b + (p+1) * N + col); + b1 = convert_half4(vload4(0, b + (p+1) * N + col)); a0.y = a[p + 1]; } case 1: { - b0 = vload4(0, b + (p) * N + col); + b0 = convert_half4(vload4(0, b + (p) * N + col)); a0.x = a[p]; } } @@ -308,7 +305,8 @@ void fc_gemv_1x4(__global const CL_DTYPE* a, // store res #ifdef RELU if (col % 4 == 0) { - vstore4(fmax(c0, (CL_DTYPE4)0.f), 0, c + col); + float4 act_res = convert_float4(fmax(c0, (half4)0.f)); + vstore4(act_res, 0, c + col); } else { switch (col % 4) { case 3: @@ -321,7 +319,7 @@ void fc_gemv_1x4(__global const CL_DTYPE* a, } #else if (col % 4 == 0) { - vstore4(c0, 0, c + col); + vstore4(convert_float4(c0), 0, c + col); } else { switch (col % 4) { case 3: @@ -336,10 +334,10 @@ void fc_gemv_1x4(__global const CL_DTYPE* a, } else { const int left_col = N - col; for (int col_offset = 0; col_offset < left_col; ++col_offset) { - CL_DTYPE c0 = bias ? 
bias[col] : 0; + half c0 = bias ? bias[col] : 0; for (int p = 0; p < K; ++p) { - CL_DTYPE b0 = *(b + p * N + col + col_offset); - CL_DTYPE a0 = *(a + p); + half b0 = *(b + p * N + col + col_offset); + half a0 = *(a + p); c0 += a0 * b0; } #ifdef RELU @@ -366,18 +364,18 @@ void fc_gemm_4x4(__global const CL_DTYPE* a, const int col = get_global_id(1) << 2; // id: [0, N>>2) width of out == N if (row+3 < M && col+3 < N) { - CL_DTYPE bias0 = bias ? bias[col] : 0, - bias1 = bias ? bias[col+1] : 0, - bias2 = bias ? bias[col+2] : 0, - bias3 = bias ? bias[col+3] : 0; + CL_COMPUTE_DTYPE bias0 = bias ? bias[col] : 0, + bias1 = bias ? bias[col+1] : 0, + bias2 = bias ? bias[col+2] : 0, + bias3 = bias ? bias[col+3] : 0; - CL_DTYPE c00 = bias0, c01 = bias1, c02 = bias2, c03 = bias3, - c10 = bias0, c11 = bias1, c12 = bias2, c13 = bias3, - c20 = bias0, c21 = bias1, c22 = bias2, c23 = bias3, - c30 = bias0, c31 = bias1, c32 = bias2, c33 = bias3; + CL_COMPUTE_DTYPE c00 = bias0, c01 = bias1, c02 = bias2, c03 = bias3, + c10 = bias0, c11 = bias1, c12 = bias2, c13 = bias3, + c20 = bias0, c21 = bias1, c22 = bias2, c23 = bias3, + c30 = bias0, c31 = bias1, c32 = bias2, c33 = bias3; for (int p = 0; p < K; ++p) { - CL_DTYPE + CL_COMPUTE_DTYPE a00 = *(a + row * K + p), a10 = *(a + (row + 1) * K + p), a20 = *(a + (row + 2) * K + p), @@ -407,7 +405,7 @@ void fc_gemm_4x4(__global const CL_DTYPE* a, } else { for (int cidx = col; cidx < N; ++cidx) { for (int ridx = row; ridx < M; ++ridx) { - CL_DTYPE a0, b0, c0 = bias ? bias[cidx] : 0; + CL_COMPUTE_DTYPE a0, b0, c0 = bias ? 
bias[cidx] : 0; for (int p = 0; p < K; ++p) { a0 = *(a + ridx * K + p); b0 = *(b + p * N + cidx), diff --git a/lite/backends/opencl/cl_kernel/buffer/im2col_kernel.cl b/lite/backends/opencl/cl_kernel/buffer/im2col_kernel.cl index fe71f4c6ff8856ca679f2e6b29fc20a0d64da9ac..8d3456fa66973b04eaf24a04a42615790a133ddb 100644 --- a/lite/backends/opencl/cl_kernel/buffer/im2col_kernel.cl +++ b/lite/backends/opencl/cl_kernel/buffer/im2col_kernel.cl @@ -15,6 +15,8 @@ limitations under the License. */ #pragma OPENCL EXTENSION cl_khr_fp16 : enable #define CL_DTYPE float +#include + __kernel void im2col(__global const CL_DTYPE* data_im, const int img_offset, const int col_chw, diff --git a/lite/backends/opencl/cl_kernel/buffer/layout_kernel.cl b/lite/backends/opencl/cl_kernel/buffer/layout_kernel.cl deleted file mode 100644 index 532f947dd342b1ee4db69a084111a97ec014237f..0000000000000000000000000000000000000000 --- a/lite/backends/opencl/cl_kernel/buffer/layout_kernel.cl +++ /dev/null @@ -1,167 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include - -// buffer -> image2d -__kernel void buffer_to_image2d(__global CL_DTYPE *in, - __write_only image2d_t output_image, - __private const int out_H, - __private const int out_W, - __private const int out_C, - __private const int Stride0, - __private const int Stride1, - __private const int Stride2) { - - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); - const int out_n = out_nh / out_H; - const int out_h = out_nh % out_H; - - const int in_n = out_n; - const int in_c0 = out_c * 4 + 0; - const int in_c1 = out_c * 4 + 1; - const int in_c2 = out_c * 4 + 2; - const int in_c3 = out_c * 4 + 3; - const int in_h = out_h; - const int in_w = out_w; - - int input_pos0 = in_n * Stride2 + in_c0 * Stride1 + in_h * Stride0 + in_w; - int input_pos1 = in_n * Stride2 + in_c1 * Stride1 + in_h * Stride0 + in_w; - int input_pos2 = in_n * Stride2 + in_c2 * Stride1 + in_h * Stride0 + in_w; - int input_pos3 = in_n * Stride2 + in_c3 * Stride1 + in_h * Stride0 + in_w; - - int2 output_pos; - output_pos.x = out_c * out_W + out_w; - output_pos.y = out_nh; - - CL_DTYPE4 output = (CL_DTYPE4)0.0f; - output.x = convert_float(in[input_pos0]); - if(out_C - 4 * out_c >= 2){ - output.y = convert_float(in[input_pos1]); - } - if(out_C - 4 * out_c >= 3){ - output.z = convert_float(in[input_pos2]); - } - if(out_C - 4 * out_c >= 4){ - output.w = convert_float(in[input_pos3]); - } - write_imagef(output_image, output_pos, output); -} - -// buffer -> image2d_nw -__kernel void buffer_to_image2d_nw(__global CL_DTYPE* in, - __write_only image2d_t output_image, - __private const int out_H, - __private const int out_W, - __private const int out_N, - __private const int Stride0, - __private const int Stride1, - __private const int Stride2) { - const int out_n = get_global_id(0); - const int out_w = get_global_id(1); - const int out_ch = get_global_id(2); - - const int out_c = out_ch / out_H; - const int out_h = out_ch % out_H; - - const 
int in_c = out_c; // index of c in h direction - - const int in_n0 = out_n * 4 + 0; - const int in_n1 = out_n * 4 + 1; - const int in_n2 = out_n * 4 + 2; - const int in_n3 = out_n * 4 + 3; - - const int in_h = out_h; - const int in_w = out_w; - - int input_pos0 = in_n0 * Stride2 + in_c * Stride1 + in_h * Stride0 + in_w; - int input_pos1 = in_n1 * Stride2 + in_c * Stride1 + in_h * Stride0 + in_w; - int input_pos2 = in_n2 * Stride2 + in_c * Stride1 + in_h * Stride0 + in_w; - int input_pos3 = in_n3 * Stride2 + in_c * Stride1 + in_h * Stride0 + in_w; - - int2 output_pos; - output_pos.x = out_n * out_W + out_w; - output_pos.y = out_ch; - - CL_DTYPE4 output = (CL_DTYPE4)0.0f; - output.x = convert_float(in[input_pos0]); - if (out_N - 4 * out_n >= 2) { - output.y = convert_float(in[input_pos1]); - } - if (out_N - 4 * out_n >= 3) { - output.z = convert_float(in[input_pos2]); - } - if (out_N - 4 * out_n >= 4) { - output.w = convert_float(in[input_pos3]); - } - write_imagef(output_image, output_pos, output); -} - - - -// image2d -> buffer -__kernel void image2d_to_buffer(__read_only image2d_t input, - __private const int in_width, - __private const int in_height, - __global CL_DTYPE* out, - __private const int size_ch, - __private const int size_block, - __private const int size_batch, - __private const int C) { - const int in_c = get_global_id(0); - const int in_w = get_global_id(1); - const int in_nh = get_global_id(2); - const int in_n = in_nh / in_height; - const int in_h = in_nh % in_height; - - const sampler_t sampler = - CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - - const int pos_x = mad24(in_c, in_width, in_w); - CL_DTYPE4 in = read_imagef(input, sampler, (int2)(pos_x, in_nh)); - - const int index = in_n * size_batch + in_c * size_block + in_h * in_width + in_w; - out[index] = convert_float(in.x); - if (C - 4 * in_c >= 2) { - out[index + size_ch] = convert_float(in.y); - } - if(C - 4 * in_c >= 3) { - out[index + size_ch * 2] = 
convert_float(in.z); - } - if(C - 4 * in_c >= 4) { - out[index + size_ch * 3] = convert_float(in.w); - } -} - -// image2d -> buffer -__kernel void image2d_to_buffer_2d(__private const int in_height, - __private const int in_width, - __read_only image2d_t input, - __global CL_DTYPE* out) { - const int in_w = get_global_id(1); - const int in_h = get_global_id(2); - - const sampler_t sampler = - CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - - CL_DTYPE4 in = read_imagef(input, sampler, (int2)(in_w, in_h)); - - const int index = (in_h * in_width + in_w) * 4; - out[index] = convert_float(in.x); - out[index + 1] = convert_float(in.y); - out[index + 2] = convert_float(in.z); - out[index + 3] = convert_float(in.w); -} diff --git a/lite/backends/opencl/cl_kernel/cl_common.h b/lite/backends/opencl/cl_kernel/cl_common.h index c127c6cec79cb2eb8d82ce6aa6190b23d373ff64..582e6a08b16ea7b5b8edd5850b1c9af04db56aad 100644 --- a/lite/backends/opencl/cl_kernel/cl_common.h +++ b/lite/backends/opencl/cl_kernel/cl_common.h @@ -29,11 +29,15 @@ limitations under the License. */ #ifdef CL_DTYPE_float #define CL_DTYPE float #define CL_DTYPE_CHAR f +#define CL_COMPUTE_DTYPE half +#define CL_COMPUTE_DTYPE_CHAR h #endif #ifdef CL_DTYPE_half #define CL_DTYPE half #define CL_DTYPE_CHAR h +#define CL_COMPUTE_DTYPE half +#define CL_COMPUTE_DTYPE_CHAR h #endif ///////////////////////////////// @@ -43,6 +47,7 @@ limitations under the License. 
*/ #define GET_VEC_TYPE(type__, size__) type__##size__ #define VECTORIZED_TYPE(type__, size__) GET_VEC_TYPE(type__, size__) #define CL_DTYPE4 VECTORIZED_TYPE(CL_DTYPE, 4) +#define CL_COMPUTE_DTYPE4 VECTORIZED_TYPE(CL_COMPUTE_DTYPE, 4) ///////////////////////////////// // CONVERT_TYPE_TO diff --git a/lite/backends/opencl/cl_kernel/image/activation_kernel.cl b/lite/backends/opencl/cl_kernel/image/activation_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..cb29860dc7556bdaea3c09589a8c6120c5ef2a1a --- /dev/null +++ b/lite/backends/opencl/cl_kernel/image/activation_kernel.cl @@ -0,0 +1,150 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include + + +__kernel void relu(__read_only image2d_t input, + __write_only image2d_t output, + __private const float threshold, + __private const float scale) { + + const int x = get_global_id(0); // image_width + const int y = get_global_id(1); // image_height + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + + CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); + in = max((CL_DTYPE4)(0.0f), in); + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in); +} + + +__kernel void relu6(__read_only image2d_t input, + __write_only image2d_t output, + __private const float threshold, + __private const float scale){ + + const int x = get_global_id(0); + const int y = get_global_id(1); + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + + CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); + in = max((CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), in); + in = min((CL_DTYPE4)(threshold, threshold, threshold, threshold), in); + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in); +} + + +__kernel void sigmoid(__read_only image2d_t input, + __write_only image2d_t output, + __private const float threshold, + __private const float scale) { + + const int x = get_global_id(0); // image_width + const int y = get_global_id(1); // image_height + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + + CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); + CL_DTYPE4 out = 1 / (1 + exp(-in)); + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out); +} + +__kernel void leaky_relu(__read_only image2d_t input, + __write_only image2d_t output, + __private const float threshold, + __private const float scale) { + const int x = get_global_id(0); + const int y = get_global_id(1); + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + 
CLK_FILTER_NEAREST; + + CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); + CL_DTYPE4 s_val = CONVERT_TYPE_TO(scale, CL_DTYPE) * in; + if (in.x < 0.0f){ + in.x = s_val.x; + } + if (in.y < 0.0f){ + in.y = s_val.y; + } + if (in.z < 0.0f){ + in.z = s_val.z; + } + if (in.w < 0.0f){ + in.w = s_val.w; + } + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in); +} + +__kernel void tanh_act(__read_only image2d_t input, + __write_only image2d_t output, + __private const float threshold, + __private const float scale) { + + const int x = get_global_id(0); // image_width + const int y = get_global_id(1); // image_height + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + + CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); + CL_DTYPE4 out= (exp(in) - exp(-in))/ (exp(in) + exp(-in)); + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out); +} + +__kernel void exp_act(__read_only image2d_t input, + __write_only image2d_t output, + __private const float threshold, + __private const float scale) { + + const int x = get_global_id(0); // image_width + const int y = get_global_id(1); // image_height + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + + CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); + CL_DTYPE4 out = exp(in); + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out); +} + +__kernel void swish(__read_only image2d_t input, + __write_only image2d_t output, + __private const float threshold, + __private const float scale) { + + const int x = get_global_id(0); // image_width + const int y = get_global_id(1); // image_height + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + + CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); + CL_DTYPE4 out = in / (1 + exp(-(CL_DTYPE)scale * in)); + 
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out); +} + diff --git a/lite/backends/opencl/cl_kernel/image/bilinear_interp_kernel.cl b/lite/backends/opencl/cl_kernel/image/bilinear_interp_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..9427692f1267d363222295b33b6834e28517d0a4 --- /dev/null +++ b/lite/backends/opencl/cl_kernel/image/bilinear_interp_kernel.cl @@ -0,0 +1,96 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
#include <cl_common.h>

// Bilinear interpolation (resize) over an image2d tensor laid out as
// x = (channel_block * in_w + column), y = (batch * in_h + row).
// One work-item produces one output pixel (a CL_DTYPE4 = 4 channels).
//
// scale_h / scale_w : in_dim / out_dim ratios precomputed on the host.
// align_delta       : 0.5f for the "align to pixel centers" mapping,
//                     0.0f otherwise (set by the host-side kernel).
__kernel void bilinear_interp(__read_only image2d_t input,
                              __write_only image2d_t output,
                              __private const float scale_h,
                              __private const float scale_w,
                              __private const float align_delta,
                              __private const int in_dims_h,
                              __private const int in_dims_w,
                              __private const int out_dims_h,
                              __private const int out_dims_w) {
  const int c = get_global_id(0);   // channel block index (4 channels/block)
  const int w = get_global_id(1);   // output column
  const int nh = get_global_id(2);  // fused batch*height row

  int2 output_pos;
  output_pos.x = c * out_dims_w + w;
  output_pos.y = nh;

  // Map the output pixel back to a (possibly fractional) source position.
  int out_n = nh / out_dims_h;
  int out_h = nh % out_dims_h;
  float center_w = (w + align_delta) * scale_w - align_delta;
  float center_h = (out_h + align_delta) * scale_h - align_delta;

  // Truncation toward zero; for center >= 0 this is floor().
  int floor_w = (int)center_w;
  int floor_h = (int)center_h;
  int ceil_w = floor_w + 1;
  int ceil_h = floor_h + 1;
  // Clamp the 2x2 neighbourhood into the valid input range.
  if (floor_w < 0) {
    floor_w = 0;
  }
  if (floor_h < 0) {
    floor_h = 0;
  }
  if (ceil_w > in_dims_w - 1) {
    ceil_w = in_dims_w - 1;
  }
  if (ceil_h > in_dims_h - 1) {
    ceil_h = in_dims_h - 1;
  }
  // Interpolation weights, computed from the clamped floor coordinates.
  float weight0_w = center_w - floor_w;
  float weight0_h = center_h - floor_h;
  float weight1_w = 1.0f - weight0_w;
  float weight1_h = 1.0f - weight0_h;

  const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
                            CLK_ADDRESS_CLAMP |
                            CLK_FILTER_NEAREST;

  // Fetch the four neighbouring texels ("up" = larger row index here).
  int2 left_up;
  left_up.x = c * in_dims_w + floor_w;
  left_up.y = out_n * in_dims_h + ceil_h;
  CL_DTYPE4 left_up_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, left_up);

  int2 left_down;
  left_down.x = c * in_dims_w + floor_w;
  left_down.y = out_n * in_dims_h + floor_h;
  CL_DTYPE4 left_down_data =
      READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, left_down);

  int2 right_up;
  right_up.x = c * in_dims_w + ceil_w;
  right_up.y = out_n * in_dims_h + ceil_h;
  CL_DTYPE4 right_up_data =
      READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, right_up);

  int2 right_down;
  right_down.x = c * in_dims_w + ceil_w;
  right_down.y = out_n * in_dims_h + floor_h;
  CL_DTYPE4 right_down_data =
      READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, right_down);

  // Blend along w first, then along h.
  CL_DTYPE4 out =
      (left_down_data * weight1_w + right_down_data * weight0_w) * weight1_h +
      (left_up_data * weight1_w + right_up_data * weight0_w) * weight0_h;

  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, out);
}
#include <cl_common.h>

// Concatenate exactly two image2d tensors into `output`.
// Layout: x = (channel_block * out_W + column), y = (batch*H + row).
//   flag == 1: concat along channel, C_0 = channel count of input0
//   flag == 2: concat along height ("width == n" per original note)
//   flag == 3: concat along width  ("width == C" per original note)
__kernel void concat2(__read_only image2d_t input0,
                      __read_only image2d_t input1,
                      __write_only image2d_t output,
                      int flag, int C_0, int out_C, int out_W, int width) {
  const int out_w = get_global_id(0);   // output column
  const int out_c = get_global_id(1);   // output channel block (4 per block)
  const int out_nh = get_global_id(2);  // fused batch*height row
  const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
                            CLK_ADDRESS_CLAMP |
                            CLK_FILTER_NEAREST;
  if (flag == 1) {  // by channel
    int2 output_pos;
    output_pos.x = out_c * out_W + out_w;
    output_pos.y = out_nh;
    CL_DTYPE4 output_data;
    // Gather the 4 real channels of this output block one lane at a time,
    // since a block boundary may straddle the input0/input1 split.
    for (int i = 0; i < 4; i++) {
      int c = out_c * 4 + i;  // un-blocked output channel index
      if (c >= out_C) {
        break;
      }
      int c_in;
      CL_DTYPE4 input_data;
      if (c < C_0) {
        c_in = c;
        int2 input_pos;
        input_pos.x = (c_in / 4) * out_W + out_w;
        input_pos.y = out_nh;
        input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input0, sampler, input_pos);
      } else {
        c_in = c - C_0;
        int2 input_pos;
        input_pos.x = (c_in / 4) * out_W + out_w;
        input_pos.y = out_nh;
        input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input1, sampler, input_pos);
      }
      // Select source lane c_in % 4 ...
      int value_offset = c_in % 4;
      CL_DTYPE value;
      if (value_offset == 0) {
        value = input_data.x;
      } else if (value_offset == 1) {
        value = input_data.y;
      } else if (value_offset == 2) {
        value = input_data.z;
      } else if (value_offset == 3) {
        value = input_data.w;
      }
      // ... and store it into destination lane i.
      if (i == 0) {
        output_data.x = value;
      } else if (i == 1) {
        output_data.y = value;
      } else if (i == 2) {
        output_data.z = value;
      } else if (i == 3) {
        output_data.w = value;
      }
    }
    WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, output_data);
  } else if (flag == 2) {  // by height, width == n
    int2 input_pos;
    input_pos.x = out_c * out_W + out_w;
    int h = out_nh / width;
    CL_DTYPE4 input;
    if (h < C_0) {
      input_pos.y = out_nh;
      input = READ_IMG_TYPE(CL_DTYPE_CHAR, input0, sampler, input_pos);
    } else {
      // NOTE(review): this drops out_nh % width when addressing input1 —
      // looks intentional only if `width` is the per-image row count of
      // input1; verify against the host-side global-work-size setup.
      input_pos.y = (h - C_0) * width;
      input = READ_IMG_TYPE(CL_DTYPE_CHAR, input1, sampler, input_pos);
    }
    int2 output_pos;
    output_pos.x = out_c * out_W + out_w;
    output_pos.y = out_nh;
    WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, input);
  } else if (flag == 3) {  // by width, width == C
    int2 input_pos;
    input_pos.y = out_nh;
    CL_DTYPE4 input;
    if (out_w < C_0) {
      input_pos.x = out_c * out_W + out_w;
      input = READ_IMG_TYPE(CL_DTYPE_CHAR, input0, sampler, input_pos);
    } else {
      // NOTE(review): input1 is addressed with the OUTPUT row stride out_W;
      // presumably input1's image width equals out_W here — confirm.
      input_pos.x = out_c * out_W + (out_w - C_0);
      input = READ_IMG_TYPE(CL_DTYPE_CHAR, input1, sampler, input_pos);
    }
    int2 output_pos;
    output_pos.x = out_c * out_W + out_w;
    output_pos.y = out_nh;
    WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, input);
  }
}

// Copy one input tensor (of a multi-input concat) into its slice of
// `output`; the host launches this once per input with its offset C_0.
// Work-items are indexed over the INPUT (in_w, in_c, in_nh).
//   flag == 1: concat along channel, C_0 = channel offset of this input
//   flag == 2: concat along height
//   flag == 3: concat along width
__kernel void concat_mul(__read_only image2d_t input,
                         __write_only image2d_t output,
                         int flag, int C_0, int out_C, int out_W, int in_W,
                         int width) {
  const int in_w = get_global_id(0);   // input column
  const int in_c = get_global_id(1);   // input channel block
  const int in_nh = get_global_id(2);  // fused batch*height row
  const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
                            CLK_ADDRESS_CLAMP |
                            CLK_FILTER_NEAREST;
  int2 input_pos;
  input_pos.x = in_c * in_W + in_w;
  input_pos.y = in_nh;
  CL_DTYPE4 input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, input_pos);
  if (flag == 1) {  // by channel
    // Scatter each valid lane of the input pixel to its (possibly shifted)
    // output channel block.
    // NOTE(review): when C_0 % 4 != 0 each write carries lanes of
    // output_data that were not set this iteration — safe only if the
    // host guarantees 4-aligned channel offsets; verify.
    CL_DTYPE4 output_data;
    for (int i = 0; i < 4; i++) {
      int c_out = C_0 + in_c * 4 + i;
      if (c_out >= out_C) {
        break;
      }
      int2 output_pos;
      output_pos.x = (c_out / 4) * in_W + in_w;
      output_pos.y = in_nh;
      CL_DTYPE val;
      if (i == 0) {
        val = input_data.x;
      } else if (i == 1) {
        val = input_data.y;
      } else if (i == 2) {
        val = input_data.z;
      } else if (i == 3) {
        val = input_data.w;
      }
      if (c_out % 4 == 0) {
        output_data.x = val;
      } else if (c_out % 4 == 1) {
        output_data.y = val;
      } else if (c_out % 4 == 2) {
        output_data.z = val;
      } else if (c_out % 4 == 3) {
        output_data.w = val;
      }
      WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, output_data);
    }
  } else if (flag == 2) {  // by height, width == n
    int2 output_pos;
    output_pos.x = in_c * in_W + in_w;
    output_pos.y = in_nh + C_0 * width;  // shift rows by this input's offset
    WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, input_data);
  } else if (flag == 3) {  // by width, width == C
    int2 output_pos;
    output_pos.y = in_nh;
    output_pos.x = in_c * out_W + (in_w + C_0);  // shift columns by offset
    WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, input_data);
  }
}
37e03e802c56d3de9ba08e97c9dfb62f8cd76e9a..4b2d5ba32072e7eb31adbf347360e0bbcee7bc5b 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_1x1_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl @@ -1,21 +1,21 @@ #include -__kernel void conv2d_1x1(__private const int global_size_dim0, +__kernel void conv2d_1x1_opt(__private const int global_size_dim0, __private const int global_size_dim1, __private const int global_size_dim2, __read_only image2d_t input_image, __read_only image2d_t filter, #if defined(BIASE_CH) || defined(BIASE_ELE) - __read_only image2d_t bias, + __read_only image2d_t bias, #endif #ifdef BATCH_NORM - __read_only image2d_t new_scale, +__read_only image2d_t new_scale, __read_only image2d_t new_biase, #endif __write_only image2d_t output_image, __private const int stride, __private const int offset, - __private const int input_c, + __private const int input_c_block, __private const int input_c_origin, __private const int dilation, __private const int input_width, /* of one block */ @@ -23,7 +23,7 @@ __kernel void conv2d_1x1(__private const int global_size_dim0, __private const int output_width, __private const int output_height, __private const int old_w) { - CL_DTYPE zero = 0.0f; + const int out_c = get_global_id(0); const int out_w = get_global_id(1); const int out_nh = get_global_id(2); @@ -79,14 +79,9 @@ __kernel void conv2d_1x1(__private const int global_size_dim0, CL_DTYPE4 output3 = 0.0f; #endif - int max_w_bound = input_c * input_width; - int burndary_index = input_c * 4 - input_c_origin; - bool burndary_index_w = - burndary_index == 1 || burndary_index == 2 || burndary_index == 3; - bool burndary_index_z = burndary_index == 2 || burndary_index == 3; - bool burndary_index_y = burndary_index == 3; - - for (int i = 0; i < input_c; ++i) { + int max_w_bound = input_c_block * input_width; + int burndary_index = input_c_block * 4 - input_c_origin; + for (int i = 0; i < input_c_block; ++i) { // ------------0--------------- 
int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x, in_pos_in_one_block0.y); @@ -101,34 +96,73 @@ __kernel void conv2d_1x1(__private const int global_size_dim0, READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(out_c, i * 4 + 2)); CL_DTYPE4 weight3 = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(out_c, i * 4 + 3)); - int bound_gap = max_w_bound - pos_in.x - 1; - bool outof_bound = bound_gap < input_width && bound_gap >= 0; - input0.w = select(input0.w, zero, outof_bound && burndary_index_w); - input0.z = select(input0.z, zero, outof_bound && burndary_index_z); - input0.y = select(input0.y, zero, outof_bound && burndary_index_y); + if ((max_w_bound - pos_in.x - 1) < input_width && + (max_w_bound - pos_in.x - 1) >= 0) { + if (burndary_index == 0) { + output0 = mad(input0.x, weight0, output0); + output0 = mad(input0.y, weight1, output0); + output0 = mad(input0.z, weight2, output0); + output0 = mad(input0.w, weight3, output0); + } else if (burndary_index == 1) { + output0 = mad(input0.x, weight0, output0); + output0 = mad(input0.y, weight1, output0); + output0 = mad(input0.z, weight2, output0); + output0 = mad(0.0f, weight3, output0); + + } else if (burndary_index == 2) { + output0 = mad(input0.x, weight0, output0); + output0 = mad(input0.y, weight1, output0); + output0 = mad(0.0f, weight2, output0); + output0 = mad(0.0f, weight3, output0); + } else if (burndary_index == 3) { + output0 = mad(input0.x, weight0, output0); + output0 = mad(0.0f, weight1, output0); + output0 = mad(0.0f, weight2, output0); + output0 = mad(0.0f, weight3, output0); + } + } else { + output0 = mad(input0.x, weight0, output0); + output0 = mad(input0.y, weight1, output0); + output0 = mad(input0.z, weight2, output0); + output0 = mad(input0.w, weight3, output0); + } - output0 = mad(input0.x, weight0, output0); - output0 = mad(input0.y, weight1, output0); - output0 = mad(input0.z, weight2, output0); - output0 = mad(input0.w, weight3, output0); // -------------1-------------- 
pos_in = (int2)(i * input_width + in_pos_in_one_block1.x, in_pos_in_one_block1.y); CL_DTYPE4 input1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in); - bound_gap = max_w_bound - pos_in.x - 1; - - outof_bound = bound_gap < input_width && bound_gap >= 0; - input1.w = select(input1.w, zero, outof_bound && burndary_index_w); - input1.z = select(input1.z, zero, outof_bound && burndary_index_z); - input1.y = select(input1.y, zero, outof_bound && burndary_index_y); - - output1 = mad(input1.x, weight0, output1); - output1 = mad(input1.y, weight1, output1); - output1 = mad(input1.z, weight2, output1); - output1 = mad(input1.w, weight3, output1); + if (abs(max_w_bound - pos_in.x) < input_width) { + if (burndary_index == 0) { + output1 = mad(input1.x, weight0, output1); + output1 = mad(input1.y, weight1, output1); + output1 = mad(input1.z, weight2, output1); + output1 = mad(input1.w, weight3, output1); + } else if (burndary_index == 1) { + output1 = mad(input1.x, weight0, output1); + output1 = mad(input1.y, weight1, output1); + output1 = mad(input1.z, weight2, output1); + output1 = mad(0.0f, weight3, output1); + + } else if (burndary_index == 2) { + output1 = mad(input1.x, weight0, output1); + output1 = mad(input1.y, weight1, output1); + output1 = mad(0.0f, weight2, output1); + output1 = mad(0.0f, weight3, output1); + } else if (burndary_index == 3) { + output1 = mad(input1.x, weight0, output1); + output1 = mad(0.0f, weight1, output1); + output1 = mad(0.0f, weight2, output1); + output1 = mad(0.0f, weight3, output1); + } + } else { + output1 = mad(input1.x, weight0, output1); + output1 = mad(input1.y, weight1, output1); + output1 = mad(input1.z, weight2, output1); + output1 = mad(input1.w, weight3, output1); + } // -------------2-------------- pos_in = (int2)(i * input_width + in_pos_in_one_block2.x, @@ -136,41 +170,71 @@ __kernel void conv2d_1x1(__private const int global_size_dim0, CL_DTYPE4 input2 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in); - 
bound_gap = max_w_bound - pos_in.x - 1; - - outof_bound = bound_gap < input_width && bound_gap >= 0; - input2.w = select(input2.w, zero, outof_bound && burndary_index_w); - input2.z = select(input2.z, zero, outof_bound && burndary_index_z); - input2.y = select(input2.y, zero, outof_bound && burndary_index_y); - - output2 = mad(input2.x, weight0, output2); - output2 = mad(input2.y, weight1, output2); - output2 = mad(input2.z, weight2, output2); - output2 = mad(input2.w, weight3, output2); + if (abs(max_w_bound - pos_in.x) < input_width) { + if (burndary_index == 0) { + output2 = mad(input2.x, weight0, output2); + output2 = mad(input2.y, weight1, output2); + output2 = mad(input2.z, weight2, output2); + output2 = mad(input2.w, weight3, output2); + } else if (burndary_index == 1) { + output2 = mad(input2.x, weight0, output2); + output2 = mad(input2.y, weight1, output2); + output2 = mad(input2.z, weight2, output2); + output2 = mad(0.0f, weight3, output2); + + } else if (burndary_index == 2) { + output2 = mad(input2.x, weight0, output2); + output2 = mad(input2.y, weight1, output2); + output2 = mad(0.0f, weight2, output2); + output2 = mad(0.0f, weight3, output2); + } else if (burndary_index == 3) { + output2 = mad(input2.x, weight0, output2); + output2 = mad(0.0f, weight1, output2); + output2 = mad(0.0f, weight2, output2); + output2 = mad(0.0f, weight3, output2); + } + } else { + output2 = mad(input2.x, weight0, output2); + output2 = mad(input2.y, weight1, output2); + output2 = mad(input2.z, weight2, output2); + output2 = mad(input2.w, weight3, output2); + } // -------------3-------------- pos_in = (int2)(i * input_width + in_pos_in_one_block3.x, in_pos_in_one_block3.y); CL_DTYPE4 input3 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in); - bound_gap = max_w_bound - pos_in.x - 1; - - outof_bound = bound_gap < input_width && bound_gap >= 0; - input3.w = - select(input3.w, - zero, - outof_bound && (burndary_index == 1 || burndary_index == 2 || - burndary_index == 
3)); - input3.z = - select(input3.z, - zero, - outof_bound && (burndary_index == 2 || burndary_index == 3)); - input3.y = select(input3.y, zero, outof_bound && burndary_index == 3); - output3 = mad(input3.x, weight0, output3); - output3 = mad(input3.y, weight1, output3); - output3 = mad(input3.z, weight2, output3); - output3 = mad(input3.w, weight3, output3); + if (abs(max_w_bound - pos_in.x) < input_width) { + if (burndary_index == 0) { + output3 = mad(input3.x, weight0, output3); + output3 = mad(input3.y, weight1, output3); + output3 = mad(input3.z, weight2, output3); + output3 = mad(input3.w, weight3, output3); + } else if (burndary_index == 1) { + output3 = mad(input3.x, weight0, output3); + output3 = mad(input3.y, weight1, output3); + output3 = mad(input3.z, weight2, output3); + output3 = mad(0.0f, weight3, output3); + + } else if (burndary_index == 2) { + output3 = mad(input3.x, weight0, output3); + output3 = mad(input3.y, weight1, output3); + output3 = mad(0.0f, weight2, output3); + output3 = mad(0.0f, weight3, output3); + } else if (burndary_index == 3) { + output3 = mad(input3.x, weight0, output3); + output3 = mad(0.0f, weight1, output3); + output3 = mad(0.0f, weight2, output3); + output3 = mad(0.0f, weight3, output3); + } + } else { + output3 = mad(input3.x, weight0, output3); + output3 = mad(input3.y, weight1, output3); + output3 = mad(input3.z, weight2, output3); + output3 = mad(input3.w, weight3, output3); + } } #ifdef BATCH_NORM @@ -191,12 +255,10 @@ __kernel void conv2d_1x1(__private const int global_size_dim0, READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0)); #endif -#ifdef RELU output0 = activation_type4(output0); output1 = activation_type4(output1); output2 = activation_type4(output2); output3 = activation_type4(output3); -#endif if (out_w0 < old_w) { WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos0, output0); @@ -215,29 +277,30 @@ __kernel void conv2d_1x1(__private const int global_size_dim0, } } -__kernel void 
conv2d_1x1_simple(__private const int global_size_dim0, - __private const int global_size_dim1, - __private const int global_size_dim2, - __read_only image2d_t input_image, - __read_only image2d_t filter, +__kernel void conv2d_1x1_simple( + __private const int global_size_dim0, + __private const int global_size_dim1, + __private const int global_size_dim2, + __read_only image2d_t input_image, + __read_only image2d_t filter, #if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, #endif #ifdef BATCH_NORM __read_only image2d_t new_scale, - __read_only image2d_t new_biase, + __read_only image2d_t new_biase, #endif - __write_only image2d_t output_image, - __private const int stride, - __private const int offset, - __private const int input_c, - __private const int input_c_origin, - __private const int dilation, - __private const int input_width, /* of one block */ - __private const int input_height, /* of one block */ - __private const int output_width, - __private const int output_height, - __private const int old_w) { + __write_only image2d_t output_image, + __private const int stride, + __private const int offset, + __private const int input_c, + __private const int input_c_origin, + __private const int dilation, + __private const int input_width, /* of one block */ + __private const int input_height, /* of one block */ + __private const int output_width, + __private const int output_height, + __private const int old_w) { const int out_c = get_global_id(0); const int out_w = get_global_id(1); const int out_nh = get_global_id(2); @@ -360,13 +423,11 @@ __read_only image2d_t new_scale, READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0)); #endif - output0 = activation_type4(output0); output1 = activation_type4(output1); output2 = activation_type4(output2); output3 = activation_type4(output3); - if (out_w0 < old_w) { WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos0, output0); } diff --git 
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <cl_common.h>

// 3x3 convolution, output-tiled: each work-item computes FIVE output
// pixels of one output-channel block, spaced `item_w` columns apart.
// Filter image layout: x = ch_block * 3 + kw, y = out_ch * 3 + kh.
// `dilation` and `batch` are accepted but unused here (this variant
// assumes dilation == 1; see conv2d_3x3_multi_batch below for batch > 1).
__kernel void conv2d_3x3_opt(__private const int item_ch,
                             __private const int item_w,
                             __private const int item_h,
                             __read_only image2d_t input_image,
                             __read_only image2d_t filter_image,
#if defined(BIASE_CH) || defined(BIASE_ELE)
                             __read_only image2d_t bias,
#endif
                             __write_only image2d_t output_image,
                             __private const int stride,
                             __private const int pad,
                             __private const int dilation,
                             __private const int batch,
                             __private const int in_ch,
                             __private const int in_w,
                             __private const int in_h,
                             __private const int out_w,
                             __private const int out_h) {

  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

  // item_id
  const int item_ch_id = get_global_id(0);
  const int item_w_id = get_global_id(1);
  const int item_h_id = get_global_id(2);

  // out_width_id_per_blk and out_batch_id
  int out_batch_id = item_h_id / in_h;
  int out_w_base_id = item_ch_id * out_w;
  // The five output columns handled by this work-item, item_w apart.
  int out_w_id0 = item_w_id;
  int out_w_id1 = out_w_id0 + item_w;
  int out_w_id2 = out_w_id1 + item_w;
  int out_w_id3 = out_w_id2 + item_w;
  int out_w_id4 = out_w_id3 + item_w;

  // in_width_id_per_blk and in_height_id_per_batch
  int in_h_id = (item_h_id % out_h) * stride - pad;
  int in_w_id0 = item_w_id * stride - pad;
  int in_w_id1 = in_w_id0 + item_w * stride;
  int in_w_id2 = in_w_id1 + item_w * stride;
  int in_w_id3 = in_w_id2 + item_w * stride;
  int in_w_id4 = in_w_id3 + item_w * stride;

#ifdef BIASE_CH

  // Per-output-channel bias: one value broadcast to all five outputs.
  CL_DTYPE4 output[5];
  output[0] =
      READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(item_ch_id, 0));
  output[1] = output[0];
  output[2] = output[0];
  output[3] = output[0];
  output[4] = output[0];

#elif defined(BIASE_ELE)

  // Element-wise bias: one bias pixel per output pixel.
  CL_DTYPE4 output[5];
  output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR,
                            bias,
                            sampler,
                            (int2)(out_w_base_id + out_w_id0, item_h_id));
  if (out_w_id1 < out_w) {
    output[1] = READ_IMG_TYPE(CL_DTYPE_CHAR,
                              bias,
                              sampler,
                              (int2)(out_w_base_id + out_w_id1, item_h_id));
  }
  if (out_w_id2 < out_w) {
    output[2] = READ_IMG_TYPE(CL_DTYPE_CHAR,
                              bias,
                              sampler,
                              (int2)(out_w_base_id + out_w_id2, item_h_id));
  }
  if (out_w_id3 < out_w) {
    output[3] = READ_IMG_TYPE(CL_DTYPE_CHAR,
                              bias,
                              sampler,
                              (int2)(out_w_base_id + out_w_id3, item_h_id));
  }
  if (out_w_id4 < out_w) {
    output[4] = READ_IMG_TYPE(CL_DTYPE_CHAR,
                              bias,
                              sampler,
                              (int2)(out_w_base_id + out_w_id4, item_h_id));
  }
#else
  CL_DTYPE4 output[5] = {0.0f};
#endif

  CL_DTYPE4 filter[4] = {0.0f};
  CL_DTYPE4 filter_trans[4] = {0.0f};
  CL_DTYPE4 input[5] = {0.0f};

  // Filter rows for the 4 output channels of this block.
  int filter_h_val0 = item_ch_id * 4 * 3;
  int filter_h_val1 = filter_h_val0 + 3;
  int filter_h_val2 = filter_h_val1 + 3;
  int filter_h_val3 = filter_h_val2 + 3;

  // Loop over input-channel blocks of 4.
  for (int ch = 0; ch < (in_ch + 3) / 4; ch++) {
    // Number of padding channels in the last (partial) block.
    int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? (ch + 1) * 4 - in_ch : 0;

    const int in_w_base_id = mul24(ch, in_w);

    int filter_w_val = ch * 3;

    for (int h = 0; h < 3; h++) {
      // Out-of-range rows are forced to -1 so the CLAMP sampler returns
      // the zero border colour (implicit zero padding).
      int in_h_val = select(out_batch_id * in_h + in_h_id + h,
                            -1,
                            (out_batch_id * in_h + in_h_id + h < 0 ||
                             out_batch_id * in_h + in_h_id + h >= in_h));

      for (int w = 0; w < 3; w++) {
        // Same trick for out-of-range columns of each of the 5 taps.
        int in_w_val0 = select(in_w_base_id + in_w_id0 + w,
                               -1,
                               (in_w_id0 + w < 0 || in_w_id0 + w >= in_w));
        int in_w_val1 = select(in_w_base_id + in_w_id1 + w,
                               -1,
                               (in_w_id1 + w < 0 || in_w_id1 + w >= in_w));
        int in_w_val2 = select(in_w_base_id + in_w_id2 + w,
                               -1,
                               (in_w_id2 + w < 0 || in_w_id2 + w >= in_w));
        int in_w_val3 = select(in_w_base_id + in_w_id3 + w,
                               -1,
                               (in_w_id3 + w < 0 || in_w_id3 + w >= in_w));
        int in_w_val4 = select(in_w_base_id + in_w_id4 + w,
                               -1,
                               (in_w_id4 + w < 0 || in_w_id4 + w >= in_w));

        filter[0] = READ_IMG_TYPE(
            CL_DTYPE_CHAR,
            filter_image,
            sampler,
            (int2)(filter_w_val + w, filter_h_val0 + h));  // in_ch:0-3,out_ch:0
        filter[1] = READ_IMG_TYPE(
            CL_DTYPE_CHAR,
            filter_image,
            sampler,
            (int2)(filter_w_val + w, filter_h_val1 + h));  // in_ch:0-3,out_ch:1
        filter[2] = READ_IMG_TYPE(
            CL_DTYPE_CHAR,
            filter_image,
            sampler,
            (int2)(filter_w_val + w, filter_h_val2 + h));  // in_ch:0-3,out_ch:2
        filter[3] = READ_IMG_TYPE(
            CL_DTYPE_CHAR,
            filter_image,
            sampler,
            (int2)(filter_w_val + w, filter_h_val3 + h));  // in_ch:0-3,out_ch:3

        // Transpose so each vector holds one input channel across the
        // 4 output channels (lets us use mad with a scalar input lane).
        filter_trans[0] = (CL_DTYPE4)(filter[0].x,
                                      filter[1].x,
                                      filter[2].x,
                                      filter[3].x);  // in_ch:0,out_ch:0-3
        filter_trans[1] = (CL_DTYPE4)(filter[0].y,
                                      filter[1].y,
                                      filter[2].y,
                                      filter[3].y);  // in_ch:1,out_ch:0-3
        filter_trans[2] = (CL_DTYPE4)(filter[0].z,
                                      filter[1].z,
                                      filter[2].z,
                                      filter[3].z);  // in_ch:2,out_ch:0-3
        filter_trans[3] = (CL_DTYPE4)(filter[0].w,
                                      filter[1].w,
                                      filter[2].w,
                                      filter[3].w);  // in_ch:3,out_ch:0-3

        input[0] = READ_IMG_TYPE(
            CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val0, in_h_val));
        input[1] = READ_IMG_TYPE(
            CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val1, in_h_val));
        input[2] = READ_IMG_TYPE(
            CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val2, in_h_val));
        input[3] = READ_IMG_TYPE(
            CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val3, in_h_val));
        input[4] = READ_IMG_TYPE(
            CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val4, in_h_val));

        output[0] = mad(input[0].x, filter_trans[0], output[0]);
        output[1] = mad(input[1].x, filter_trans[0], output[1]);
        output[2] = mad(input[2].x, filter_trans[0], output[2]);
        output[3] = mad(input[3].x, filter_trans[0], output[3]);
        output[4] = mad(input[4].x, filter_trans[0], output[4]);

        // Skip padded channels of the last partial input-channel block.
        if (ch_surplus < 3) {
          output[0] = mad(input[0].y, filter_trans[1], output[0]);
          output[1] = mad(input[1].y, filter_trans[1], output[1]);
          output[2] = mad(input[2].y, filter_trans[1], output[2]);
          output[3] = mad(input[3].y, filter_trans[1], output[3]);
          output[4] = mad(input[4].y, filter_trans[1], output[4]);
        }
        if (ch_surplus < 2) {
          output[0] = mad(input[0].z, filter_trans[2], output[0]);
          output[1] = mad(input[1].z, filter_trans[2], output[1]);
          output[2] = mad(input[2].z, filter_trans[2], output[2]);
          output[3] = mad(input[3].z, filter_trans[2], output[3]);
          output[4] = mad(input[4].z, filter_trans[2], output[4]);
        }
        if (ch_surplus < 1) {
          output[0] = mad(input[0].w, filter_trans[3], output[0]);
          output[1] = mad(input[1].w, filter_trans[3], output[1]);
          output[2] = mad(input[2].w, filter_trans[3], output[2]);
          output[3] = mad(input[3].w, filter_trans[3], output[3]);
          output[4] = mad(input[4].w, filter_trans[3], output[4]);
        }
      }
    }
  }

  // Fused activation (identity/relu/... selected at compile time).
  output[0] = activation_type4(output[0]);
  output[1] = activation_type4(output[1]);
  output[2] = activation_type4(output[2]);
  output[3] = activation_type4(output[3]);
  output[4] = activation_type4(output[4]);

  // Write back, guarding the tail columns of a partial tile.
  WRITE_IMG_TYPE(CL_DTYPE_CHAR,
                 output_image,
                 (int2)(out_w_base_id + out_w_id0, item_h_id),
                 output[0]);
  if (out_w_id1 < out_w) {
    WRITE_IMG_TYPE(CL_DTYPE_CHAR,
                   output_image,
                   (int2)(out_w_base_id + out_w_id1, item_h_id),
                   output[1]);
  }
  if (out_w_id2 < out_w) {
    WRITE_IMG_TYPE(CL_DTYPE_CHAR,
                   output_image,
                   (int2)(out_w_base_id + out_w_id2, item_h_id),
                   output[2]);
  }
  if (out_w_id3 < out_w) {
    WRITE_IMG_TYPE(CL_DTYPE_CHAR,
                   output_image,
                   (int2)(out_w_base_id + out_w_id3, item_h_id),
                   output[3]);
  }
  if (out_w_id4 < out_w) {
    WRITE_IMG_TYPE(CL_DTYPE_CHAR,
                   output_image,
                   (int2)(out_w_base_id + out_w_id4, item_h_id),
                   output[4]);
  }
}

// support batch > 1
__kernel void conv2d_3x3_multi_batch(__private const int item_ch,
                                     __private const int item_w,
                                     __private const int item_h,
                                     __read_only image2d_t input_image,
                                     __read_only image2d_t filter_image,
#if defined(BIASE_CH) || defined(BIASE_ELE)
                                     __read_only image2d_t bias,
#endif
                                     __write_only image2d_t output_image,
                                     __private const int stride,
                                     __private const int pad,
                                     __private const int dilation,
                                     __private const int batch,
                                     __private const int in_ch,
                                     __private const int in_w,
                                     __private const int in_h,
                                     __private const int out_w,
                                     __private const int out_h) {

  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

  // item_id
  const int item_ch_id = get_global_id(0);
  const int item_w_id = get_global_id(1);
  const int item_h_id = get_global_id(2);

  // out_width_id_per_blk and out_batch_id
  int out_batch_id = item_h_id / in_h;
  int out_w_base_id = item_ch_id * out_w;
  int out_w_id0 = item_w_id;
  int out_w_id1 = out_w_id0 + item_w;
  int out_w_id2 = out_w_id1 + item_w;
  int out_w_id3 = out_w_id2 + item_w;
  int out_w_id4 = out_w_id3 + item_w;

  // in_width_id_per_blk and in_height_id_per_batch
  int in_h_id = (item_h_id % out_h) * stride - pad;
  int in_w_id0 = item_w_id * stride - pad;
  int in_w_id1 = in_w_id0 + item_w * stride;
  int in_w_id2 = in_w_id1 + item_w * stride;
  int in_w_id3 = in_w_id2 + item_w *
stride; + int in_w_id4 = in_w_id3 + item_w * stride; + +#ifdef BIASE_CH + + CL_DTYPE4 output[5]; + output[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(item_ch_id, 0)); + output[1] = output[0]; + output[2] = output[0]; + output[3] = output[0]; + output[4] = output[0]; + +#elif defined(BIASE_ELE) + + CL_DTYPE4 output[5]; + output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id0, item_h_id)); + if (out_w_id1 < out_w) { + output[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id1, item_h_id)); + } + if (out_w_id2 < out_w) { + output[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id2, item_h_id)); + } + if (out_w_id3 < out_w) { + output[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id3, item_h_id)); + } + if (out_w_id4 < out_w) { + output[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id4, item_h_id)); + } +#else + CL_DTYPE4 output[5] = {0.0f}; +#endif + + CL_DTYPE4 filter[4] = {0.0f}; + CL_DTYPE4 filter_trans[4] = {0.0f}; + CL_DTYPE4 input[5] = {0.0f}; + + int filter_h_val0 = item_ch_id * 4 * 3; + int filter_h_val1 = filter_h_val0 + 3; + int filter_h_val2 = filter_h_val1 + 3; + int filter_h_val3 = filter_h_val2 + 3; + + for (int ch = 0; ch < (in_ch + 3) / 4; ch++) { + int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? 
(ch + 1) * 4 - in_ch : 0; + + const int in_w_base_id = mul24(ch, in_w); + + int filter_w_val = ch * 3; + + for (int h = 0; h < 3; h++) { + int in_h_val = select( + out_batch_id * in_h + in_h_id + h, + -1, + (out_batch_id * in_h + in_h_id + h < out_batch_id * in_h || + out_batch_id * in_h + in_h_id + h >= (out_batch_id + 1) * in_h)); + + for (int w = 0; w < 3; w++) { + int in_w_val0 = select(in_w_base_id + in_w_id0 + w, + -1, + (in_w_id0 + w < 0 || in_w_id0 + w >= in_w)); + int in_w_val1 = select(in_w_base_id + in_w_id1 + w, + -1, + (in_w_id1 + w < 0 || in_w_id1 + w >= in_w)); + int in_w_val2 = select(in_w_base_id + in_w_id2 + w, + -1, + (in_w_id2 + w < 0 || in_w_id2 + w >= in_w)); + int in_w_val3 = select(in_w_base_id + in_w_id3 + w, + -1, + (in_w_id3 + w < 0 || in_w_id3 + w >= in_w)); + int in_w_val4 = select(in_w_base_id + in_w_id4 + w, + -1, + (in_w_id4 + w < 0 || in_w_id4 + w >= in_w)); + + filter[0] = READ_IMG_TYPE( + CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, filter_h_val0 + h)); // in_ch:0-3,out_ch:0 + filter[1] = READ_IMG_TYPE( + CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, filter_h_val1 + h)); // in_ch:0-3,out_ch:1 + filter[2] = READ_IMG_TYPE( + CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, filter_h_val2 + h)); // in_ch:0-3,out_ch:2 + filter[3] = READ_IMG_TYPE( + CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, filter_h_val3 + h)); // in_ch:0-3,out_ch:3 + + filter_trans[0] = (CL_DTYPE4)(filter[0].x, + filter[1].x, + filter[2].x, + filter[3].x); // in_ch:0,out_ch:0-3 + filter_trans[1] = (CL_DTYPE4)(filter[0].y, + filter[1].y, + filter[2].y, + filter[3].y); // in_ch:1,out_ch:0-3 + filter_trans[2] = (CL_DTYPE4)(filter[0].z, + filter[1].z, + filter[2].z, + filter[3].z); // in_ch:2,out_ch:0-3 + filter_trans[3] = (CL_DTYPE4)(filter[0].w, + filter[1].w, + filter[2].w, + filter[3].w); // in_ch:3,out_ch:0-3 + + input[0] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, 
sampler, (int2)(in_w_val0, in_h_val)); + input[1] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val1, in_h_val)); + input[2] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val2, in_h_val)); + input[3] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val3, in_h_val)); + input[4] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val4, in_h_val)); + + output[0] = mad(input[0].x, filter_trans[0], output[0]); + output[1] = mad(input[1].x, filter_trans[0], output[1]); + output[2] = mad(input[2].x, filter_trans[0], output[2]); + output[3] = mad(input[3].x, filter_trans[0], output[3]); + output[4] = mad(input[4].x, filter_trans[0], output[4]); + + if (ch_surplus < 3) { + output[0] = mad(input[0].y, filter_trans[1], output[0]); + output[1] = mad(input[1].y, filter_trans[1], output[1]); + output[2] = mad(input[2].y, filter_trans[1], output[2]); + output[3] = mad(input[3].y, filter_trans[1], output[3]); + output[4] = mad(input[4].y, filter_trans[1], output[4]); + } + if (ch_surplus < 2) { + output[0] = mad(input[0].z, filter_trans[2], output[0]); + output[1] = mad(input[1].z, filter_trans[2], output[1]); + output[2] = mad(input[2].z, filter_trans[2], output[2]); + output[3] = mad(input[3].z, filter_trans[2], output[3]); + output[4] = mad(input[4].z, filter_trans[2], output[4]); + } + if (ch_surplus < 1) { + output[0] = mad(input[0].w, filter_trans[3], output[0]); + output[1] = mad(input[1].w, filter_trans[3], output[1]); + output[2] = mad(input[2].w, filter_trans[3], output[2]); + output[3] = mad(input[3].w, filter_trans[3], output[3]); + output[4] = mad(input[4].w, filter_trans[3], output[4]); + } + } + } + } + + output[0] = activation_type4(output[0]); + output[1] = activation_type4(output[1]); + output[2] = activation_type4(output[2]); + output[3] = activation_type4(output[3]); + output[4] = activation_type4(output[4]); + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + 
out_w_id0, item_h_id), + output[0]); + if (out_w_id1 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id1, item_h_id), + output[1]); + } + if (out_w_id2 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id2, item_h_id), + output[2]); + } + if (out_w_id3 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id3, item_h_id), + output[3]); + } + if (out_w_id4 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id4, item_h_id), + output[4]); + } +} diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_5x5_opt_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_5x5_opt_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..4ed2e072022dc4b457a86d634bf4bc21ab62bc45 --- /dev/null +++ b/lite/backends/opencl/cl_kernel/image/conv2d_5x5_opt_kernel.cl @@ -0,0 +1,516 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include + +// opt version of conv5x5 +__kernel void conv2d_5x5_opt(__private const int item_ch, + __private const int item_w, + __private const int item_h, + __read_only image2d_t input_image, + __read_only image2d_t filter_image, +#if defined(BIASE_CH) || defined(BIASE_ELE) + __read_only image2d_t bias, +#endif + __write_only image2d_t output_image, + __private const int stride, + __private const int pad, + __private const int dilation, + __private const int batch, + __private const int in_ch, + __private const int in_w, + __private const int in_h, + __private const int out_w, + __private const int out_h) { + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + // filter + const int filter_w = 5; + const int filter_h = 5; + + // item_id + const int item_ch_id = get_global_id(0); + const int item_w_id = get_global_id(1); + const int item_h_id = get_global_id(2); + + // out_width_id_per_blk and out_batch_id + int out_w_base_id = item_ch_id * out_w; + int out_w_id0 = item_w_id; + int out_w_id1 = out_w_id0 + item_w; + int out_w_id2 = out_w_id1 + item_w; + int out_w_id3 = out_w_id2 + item_w; + int out_w_id4 = out_w_id3 + item_w; + + // in_width_id_per_blk and in_height_id_per_batch + int in_h_id = (item_h_id % out_h) * stride - pad; + int in_w_id0 = item_w_id * stride - pad; + int in_w_id1 = in_w_id0 + item_w * stride; + int in_w_id2 = in_w_id1 + item_w * stride; + int in_w_id3 = in_w_id2 + item_w * stride; + int in_w_id4 = in_w_id3 + item_w * stride; + +#ifdef BIASE_CH + + CL_DTYPE4 output[5]; + output[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(item_ch_id, 0)); + output[1] = output[0]; + output[2] = output[0]; + output[3] = output[0]; + output[4] = output[0]; + +#elif defined(BIASE_ELE) + + CL_DTYPE4 output[5]; + output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id0, item_h_id)); + if (out_w_id1 < out_w) { + output[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + 
sampler, + (int2)(out_w_base_id + out_w_id1, item_h_id)); + } + if (out_w_id2 < out_w) { + output[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id2, item_h_id)); + } + if (out_w_id3 < out_w) { + output[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id3, item_h_id)); + } + if (out_w_id4 < out_w) { + output[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id4, item_h_id)); + } +#else + CL_DTYPE4 output[5] = {0.0f}; +#endif + + CL_DTYPE4 filter[4] = {0.0f}; + CL_DTYPE4 filter_trans[4] = {0.0f}; + CL_DTYPE4 input[5] = {0.0f}; + + int filter_h_val0 = item_ch_id * 4 * filter_h; + int filter_h_val1 = filter_h_val0 + filter_h; + int filter_h_val2 = filter_h_val1 + filter_h; + int filter_h_val3 = filter_h_val2 + filter_h; + + for (int ch = 0; ch < (in_ch + 3) / 4; ch++) { + int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? (ch + 1) * 4 - in_ch : 0; + + const int in_w_base_id = mul24(ch, in_w); + + int filter_w_val = ch * filter_w; + + for (int h = 0; h < filter_h; h++) { + int in_h_val = + select(in_h_id + h, -1, (in_h_id + h < 0 || in_h_id + h >= in_h)); + + for (int w = 0; w < filter_w; w++) { + int in_w_val0 = select(in_w_base_id + in_w_id0 + w, + -1, + (in_w_id0 + w < 0 || in_w_id0 + w >= in_w)); + int in_w_val1 = select(in_w_base_id + in_w_id1 + w, + -1, + (in_w_id1 + w < 0 || in_w_id1 + w >= in_w)); + int in_w_val2 = select(in_w_base_id + in_w_id2 + w, + -1, + (in_w_id2 + w < 0 || in_w_id2 + w >= in_w)); + int in_w_val3 = select(in_w_base_id + in_w_id3 + w, + -1, + (in_w_id3 + w < 0 || in_w_id3 + w >= in_w)); + int in_w_val4 = select(in_w_base_id + in_w_id4 + w, + -1, + (in_w_id4 + w < 0 || in_w_id4 + w >= in_w)); + + filter[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val0 + h)); // in_ch:0-3,out_ch:0 + filter[1] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + 
filter_h_val1 + h)); // in_ch:0-3,out_ch:1 + filter[2] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val2 + h)); // in_ch:0-3,out_ch:2 + filter[3] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val3 + h)); // in_ch:0-3,out_ch:3 + + filter_trans[0] = (CL_DTYPE4)(filter[0].x, + filter[1].x, + filter[2].x, + filter[3].x); // in_ch:0,out_ch:0-3 + filter_trans[1] = (CL_DTYPE4)(filter[0].y, + filter[1].y, + filter[2].y, + filter[3].y); // in_ch:1,out_ch:0-3 + filter_trans[2] = (CL_DTYPE4)(filter[0].z, + filter[1].z, + filter[2].z, + filter[3].z); // in_ch:2,out_ch:0-3 + filter_trans[3] = (CL_DTYPE4)(filter[0].w, + filter[1].w, + filter[2].w, + filter[3].w); // in_ch:3,out_ch:0-3 + + input[0] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val0, in_h_val)); + input[1] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val1, in_h_val)); + input[2] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val2, in_h_val)); + input[3] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val3, in_h_val)); + input[4] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val4, in_h_val)); + + output[0] = mad(input[0].x, filter_trans[0], output[0]); + output[1] = mad(input[1].x, filter_trans[0], output[1]); + output[2] = mad(input[2].x, filter_trans[0], output[2]); + output[3] = mad(input[3].x, filter_trans[0], output[3]); + output[4] = mad(input[4].x, filter_trans[0], output[4]); + + if (ch_surplus < 3) { + output[0] = mad(input[0].y, filter_trans[1], output[0]); + output[1] = mad(input[1].y, filter_trans[1], output[1]); + output[2] = mad(input[2].y, filter_trans[1], output[2]); + output[3] = mad(input[3].y, filter_trans[1], output[3]); + output[4] = mad(input[4].y, filter_trans[1], output[4]); + } + if (ch_surplus < 2) { + output[0] = mad(input[0].z, filter_trans[2], output[0]); + output[1] = 
mad(input[1].z, filter_trans[2], output[1]); + output[2] = mad(input[2].z, filter_trans[2], output[2]); + output[3] = mad(input[3].z, filter_trans[2], output[3]); + output[4] = mad(input[4].z, filter_trans[2], output[4]); + } + if (ch_surplus < 1) { + output[0] = mad(input[0].w, filter_trans[3], output[0]); + output[1] = mad(input[1].w, filter_trans[3], output[1]); + output[2] = mad(input[2].w, filter_trans[3], output[2]); + output[3] = mad(input[3].w, filter_trans[3], output[3]); + output[4] = mad(input[4].w, filter_trans[3], output[4]); + } + } + } + } + + output[0] = activation_type4(output[0]); + output[1] = activation_type4(output[1]); + output[2] = activation_type4(output[2]); + output[3] = activation_type4(output[3]); + output[4] = activation_type4(output[4]); + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id0, item_h_id), + output[0]); + if (out_w_id1 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id1, item_h_id), + output[1]); + } + if (out_w_id2 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id2, item_h_id), + output[2]); + } + if (out_w_id3 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id3, item_h_id), + output[3]); + } + if (out_w_id4 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id4, item_h_id), + output[4]); + } +} +// support batch > 1 +__kernel void conv2d_5x5_multi_batch(__private const int item_ch, + __private const int item_w, + __private const int item_h, + __read_only image2d_t input_image, + __read_only image2d_t filter_image, +#if defined(BIASE_CH) || defined(BIASE_ELE) + __read_only image2d_t bias, +#endif + __write_only image2d_t output_image, + __private const int stride, + __private const int pad, + __private const int dilation, + __private const int batch, + __private const int in_ch, + __private const int in_w, + __private const int 
in_h, + __private const int out_w, + __private const int out_h) { + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + // filter + const int filter_w = 5; + const int filter_h = 5; + + // item_id + const int item_ch_id = get_global_id(0); + const int item_w_id = get_global_id(1); + const int item_h_id = get_global_id(2); + + // out_width_id_per_blk and out_batch_id + int out_batch_id = item_h_id / in_h; + int out_w_base_id = item_ch_id * out_w; + int out_w_id0 = item_w_id; + int out_w_id1 = out_w_id0 + item_w; + int out_w_id2 = out_w_id1 + item_w; + int out_w_id3 = out_w_id2 + item_w; + int out_w_id4 = out_w_id3 + item_w; + + // in_width_id_per_blk and in_height_id_per_batch + int in_h_id = (item_h_id % out_h) * stride - pad; + int in_w_id0 = item_w_id * stride - pad; + int in_w_id1 = in_w_id0 + item_w * stride; + int in_w_id2 = in_w_id1 + item_w * stride; + int in_w_id3 = in_w_id2 + item_w * stride; + int in_w_id4 = in_w_id3 + item_w * stride; + +#ifdef BIASE_CH + + CL_DTYPE4 output[5]; + output[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(item_ch_id, 0)); + output[1] = output[0]; + output[2] = output[0]; + output[3] = output[0]; + output[4] = output[0]; + +#elif defined(BIASE_ELE) + + CL_DTYPE4 output[5]; + output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id0, item_h_id)); + if (out_w_id1 < out_w) { + output[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id1, item_h_id)); + } + if (out_w_id2 < out_w) { + output[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id2, item_h_id)); + } + if (out_w_id3 < out_w) { + output[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id3, item_h_id)); + } + if (out_w_id4 < out_w) { + output[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id4, item_h_id)); + } +#else + CL_DTYPE4 output[5] = 
{0.0f}; +#endif + + CL_DTYPE4 filter[4] = {0.0f}; + CL_DTYPE4 filter_trans[4] = {0.0f}; + CL_DTYPE4 input[5] = {0.0f}; + + int filter_h_val0 = item_ch_id * 4 * filter_h; + int filter_h_val1 = filter_h_val0 + filter_h; + int filter_h_val2 = filter_h_val1 + filter_h; + int filter_h_val3 = filter_h_val2 + filter_h; + + for (int ch = 0; ch < (in_ch + 3) / 4; ch++) { + int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? (ch + 1) * 4 - in_ch : 0; + + const int in_w_base_id = mul24(ch, in_w); + + int filter_w_val = ch * filter_w; + + for (int h = 0; h < filter_h; h++) { + int in_h_val = select( + out_batch_id * in_h + in_h_id + h, + -1, + (out_batch_id * in_h + in_h_id + h < out_batch_id * in_h || + out_batch_id * in_h + in_h_id + h >= (out_batch_id + 1) * in_h)); + + for (int w = 0; w < filter_w; w++) { + int in_w_val0 = select(in_w_base_id + in_w_id0 + w, + -1, + (in_w_id0 + w < 0 || in_w_id0 + w >= in_w)); + int in_w_val1 = select(in_w_base_id + in_w_id1 + w, + -1, + (in_w_id1 + w < 0 || in_w_id1 + w >= in_w)); + int in_w_val2 = select(in_w_base_id + in_w_id2 + w, + -1, + (in_w_id2 + w < 0 || in_w_id2 + w >= in_w)); + int in_w_val3 = select(in_w_base_id + in_w_id3 + w, + -1, + (in_w_id3 + w < 0 || in_w_id3 + w >= in_w)); + int in_w_val4 = select(in_w_base_id + in_w_id4 + w, + -1, + (in_w_id4 + w < 0 || in_w_id4 + w >= in_w)); + + filter[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val0 + h)); // in_ch:0-3,out_ch:0 + filter[1] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val1 + h)); // in_ch:0-3,out_ch:1 + filter[2] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val2 + h)); // in_ch:0-3,out_ch:2 + filter[3] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val3 + h)); // in_ch:0-3,out_ch:3 + + filter_trans[0] = (CL_DTYPE4)(filter[0].x, + filter[1].x, + filter[2].x, + 
filter[3].x); // in_ch:0,out_ch:0-3 + filter_trans[1] = (CL_DTYPE4)(filter[0].y, + filter[1].y, + filter[2].y, + filter[3].y); // in_ch:1,out_ch:0-3 + filter_trans[2] = (CL_DTYPE4)(filter[0].z, + filter[1].z, + filter[2].z, + filter[3].z); // in_ch:2,out_ch:0-3 + filter_trans[3] = (CL_DTYPE4)(filter[0].w, + filter[1].w, + filter[2].w, + filter[3].w); // in_ch:3,out_ch:0-3 + + input[0] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val0, in_h_val)); + input[1] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val1, in_h_val)); + input[2] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val2, in_h_val)); + input[3] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val3, in_h_val)); + input[4] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val4, in_h_val)); + + output[0] = mad(input[0].x, filter_trans[0], output[0]); + output[1] = mad(input[1].x, filter_trans[0], output[1]); + output[2] = mad(input[2].x, filter_trans[0], output[2]); + output[3] = mad(input[3].x, filter_trans[0], output[3]); + output[4] = mad(input[4].x, filter_trans[0], output[4]); + + if (ch_surplus < 3) { + output[0] = mad(input[0].y, filter_trans[1], output[0]); + output[1] = mad(input[1].y, filter_trans[1], output[1]); + output[2] = mad(input[2].y, filter_trans[1], output[2]); + output[3] = mad(input[3].y, filter_trans[1], output[3]); + output[4] = mad(input[4].y, filter_trans[1], output[4]); + } + if (ch_surplus < 2) { + output[0] = mad(input[0].z, filter_trans[2], output[0]); + output[1] = mad(input[1].z, filter_trans[2], output[1]); + output[2] = mad(input[2].z, filter_trans[2], output[2]); + output[3] = mad(input[3].z, filter_trans[2], output[3]); + output[4] = mad(input[4].z, filter_trans[2], output[4]); + } + if (ch_surplus < 1) { + output[0] = mad(input[0].w, filter_trans[3], output[0]); + output[1] = mad(input[1].w, filter_trans[3], output[1]); + output[2] = mad(input[2].w, filter_trans[3], 
output[2]); + output[3] = mad(input[3].w, filter_trans[3], output[3]); + output[4] = mad(input[4].w, filter_trans[3], output[4]); + } + } + } + } + + output[0] = activation_type4(output[0]); + output[1] = activation_type4(output[1]); + output[2] = activation_type4(output[2]); + output[3] = activation_type4(output[3]); + output[4] = activation_type4(output[4]); + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id0, item_h_id), + output[0]); + if (out_w_id1 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id1, item_h_id), + output[1]); + } + if (out_w_id2 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id2, item_h_id), + output[2]); + } + if (out_w_id3 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id3, item_h_id), + output[3]); + } + if (out_w_id4 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id4, item_h_id), + output[4]); + } +} \ No newline at end of file diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl index 1f99322812c13287af92b52aee6c346309ee006c..4998dc99279fffad8750ef3b6495597e9fc4ad65 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl @@ -36,10 +36,10 @@ __kernel void conv2d_7x7(__private const int global_size_dim0, const int batch_index = out_nh / output_height; const int out_nh_in_one_batch = out_nh % output_height; - const filter_n0 = 4 * out_c + 0; - const filter_n1 = 4 * out_c + 1; - const filter_n2 = 4 * out_c + 2; - const filter_n3 = 4 * out_c + 3; + const int filter_n0 = 4 * out_c + 0; + const int filter_n1 = 4 * out_c + 1; + const int filter_n2 = 4 * out_c + 2; + const int filter_n3 = 4 * out_c + 3; int2 stride_xy; stride_xy.x = stride; diff --git 
a/lite/backends/opencl/cl_kernel/image/conv2d_7x7_opt_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_7x7_opt_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..d82f4b4c96b586b6ecf948827402afd0766dcea4 --- /dev/null +++ b/lite/backends/opencl/cl_kernel/image/conv2d_7x7_opt_kernel.cl @@ -0,0 +1,516 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +// opt version of con7x7 +__kernel void conv2d_7x7_opt(__private const int item_ch, + __private const int item_w, + __private const int item_h, + __read_only image2d_t input_image, + __read_only image2d_t filter_image, +#if defined(BIASE_CH) || defined(BIASE_ELE) + __read_only image2d_t bias, +#endif + __write_only image2d_t output_image, + __private const int stride, + __private const int pad, + __private const int dilation, + __private const int batch, + __private const int in_ch, + __private const int in_w, + __private const int in_h, + __private const int out_w, + __private const int out_h) { + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + // filter + const int filter_w = 7; + const int filter_h = 7; + + // item_id + const int item_ch_id = get_global_id(0); + const int item_w_id = get_global_id(1); + const int item_h_id = get_global_id(2); + + // out_width_id_per_blk and out_batch_id + int out_w_base_id = item_ch_id * out_w; + int out_w_id0 = item_w_id; + int out_w_id1 
= out_w_id0 + item_w; + int out_w_id2 = out_w_id1 + item_w; + int out_w_id3 = out_w_id2 + item_w; + int out_w_id4 = out_w_id3 + item_w; + + // in_width_id_per_blk and in_height_id_per_batch + int in_h_id = (item_h_id % out_h) * stride - pad; + int in_w_id0 = item_w_id * stride - pad; + int in_w_id1 = in_w_id0 + item_w * stride; + int in_w_id2 = in_w_id1 + item_w * stride; + int in_w_id3 = in_w_id2 + item_w * stride; + int in_w_id4 = in_w_id3 + item_w * stride; + +#ifdef BIASE_CH + + CL_DTYPE4 output[5]; + output[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(item_ch_id, 0)); + output[1] = output[0]; + output[2] = output[0]; + output[3] = output[0]; + output[4] = output[0]; + +#elif defined(BIASE_ELE) + + CL_DTYPE4 output[5]; + output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id0, item_h_id)); + if (out_w_id1 < out_w) { + output[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id1, item_h_id)); + } + if (out_w_id2 < out_w) { + output[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id2, item_h_id)); + } + if (out_w_id3 < out_w) { + output[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id3, item_h_id)); + } + if (out_w_id4 < out_w) { + output[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id4, item_h_id)); + } +#else + CL_DTYPE4 output[5] = {0.0f}; +#endif + + CL_DTYPE4 filter[4] = {0.0f}; + CL_DTYPE4 filter_trans[4] = {0.0f}; + CL_DTYPE4 input[5] = {0.0f}; + + int filter_h_val0 = item_ch_id * 4 * filter_h; + int filter_h_val1 = filter_h_val0 + filter_h; + int filter_h_val2 = filter_h_val1 + filter_h; + int filter_h_val3 = filter_h_val2 + filter_h; + + for (int ch = 0; ch < (in_ch + 3) / 4; ch++) { + int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? 
(ch + 1) * 4 - in_ch : 0; + + const int in_w_base_id = mul24(ch, in_w); + + int filter_w_val = ch * filter_w; + + for (int h = 0; h < filter_h; h++) { + int in_h_val = + select(in_h_id + h, -1, (in_h_id + h < 0 || in_h_id + h >= in_h)); + + for (int w = 0; w < filter_w; w++) { + int in_w_val0 = select(in_w_base_id + in_w_id0 + w, + -1, + (in_w_id0 + w < 0 || in_w_id0 + w >= in_w)); + int in_w_val1 = select(in_w_base_id + in_w_id1 + w, + -1, + (in_w_id1 + w < 0 || in_w_id1 + w >= in_w)); + int in_w_val2 = select(in_w_base_id + in_w_id2 + w, + -1, + (in_w_id2 + w < 0 || in_w_id2 + w >= in_w)); + int in_w_val3 = select(in_w_base_id + in_w_id3 + w, + -1, + (in_w_id3 + w < 0 || in_w_id3 + w >= in_w)); + int in_w_val4 = select(in_w_base_id + in_w_id4 + w, + -1, + (in_w_id4 + w < 0 || in_w_id4 + w >= in_w)); + + filter[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val0 + h)); // in_ch:0-3,out_ch:0 + filter[1] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val1 + h)); // in_ch:0-3,out_ch:1 + filter[2] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val2 + h)); // in_ch:0-3,out_ch:2 + filter[3] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val3 + h)); // in_ch:0-3,out_ch:3 + + filter_trans[0] = (CL_DTYPE4)(filter[0].x, + filter[1].x, + filter[2].x, + filter[3].x); // in_ch:0,out_ch:0-3 + filter_trans[1] = (CL_DTYPE4)(filter[0].y, + filter[1].y, + filter[2].y, + filter[3].y); // in_ch:1,out_ch:0-3 + filter_trans[2] = (CL_DTYPE4)(filter[0].z, + filter[1].z, + filter[2].z, + filter[3].z); // in_ch:2,out_ch:0-3 + filter_trans[3] = (CL_DTYPE4)(filter[0].w, + filter[1].w, + filter[2].w, + filter[3].w); // in_ch:3,out_ch:0-3 + + input[0] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val0, in_h_val)); + input[1] = READ_IMG_TYPE( + CL_DTYPE_CHAR, 
input_image, sampler, (int2)(in_w_val1, in_h_val)); + input[2] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val2, in_h_val)); + input[3] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val3, in_h_val)); + input[4] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val4, in_h_val)); + + output[0] = mad(input[0].x, filter_trans[0], output[0]); + output[1] = mad(input[1].x, filter_trans[0], output[1]); + output[2] = mad(input[2].x, filter_trans[0], output[2]); + output[3] = mad(input[3].x, filter_trans[0], output[3]); + output[4] = mad(input[4].x, filter_trans[0], output[4]); + + if (ch_surplus < 3) { + output[0] = mad(input[0].y, filter_trans[1], output[0]); + output[1] = mad(input[1].y, filter_trans[1], output[1]); + output[2] = mad(input[2].y, filter_trans[1], output[2]); + output[3] = mad(input[3].y, filter_trans[1], output[3]); + output[4] = mad(input[4].y, filter_trans[1], output[4]); + } + if (ch_surplus < 2) { + output[0] = mad(input[0].z, filter_trans[2], output[0]); + output[1] = mad(input[1].z, filter_trans[2], output[1]); + output[2] = mad(input[2].z, filter_trans[2], output[2]); + output[3] = mad(input[3].z, filter_trans[2], output[3]); + output[4] = mad(input[4].z, filter_trans[2], output[4]); + } + if (ch_surplus < 1) { + output[0] = mad(input[0].w, filter_trans[3], output[0]); + output[1] = mad(input[1].w, filter_trans[3], output[1]); + output[2] = mad(input[2].w, filter_trans[3], output[2]); + output[3] = mad(input[3].w, filter_trans[3], output[3]); + output[4] = mad(input[4].w, filter_trans[3], output[4]); + } + } + } + } + + output[0] = activation_type4(output[0]); + output[1] = activation_type4(output[1]); + output[2] = activation_type4(output[2]); + output[3] = activation_type4(output[3]); + output[4] = activation_type4(output[4]); + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id0, item_h_id), + output[0]); + if (out_w_id1 < out_w) { + 
WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id1, item_h_id), + output[1]); + } + if (out_w_id2 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id2, item_h_id), + output[2]); + } + if (out_w_id3 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id3, item_h_id), + output[3]); + } + if (out_w_id4 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id4, item_h_id), + output[4]); + } +} +// support batch > 1 +__kernel void conv2d_7x7_multi_batch(__private const int item_ch, + __private const int item_w, + __private const int item_h, + __read_only image2d_t input_image, + __read_only image2d_t filter_image, +#if defined(BIASE_CH) || defined(BIASE_ELE) + __read_only image2d_t bias, +#endif + __write_only image2d_t output_image, + __private const int stride, + __private const int pad, + __private const int dilation, + __private const int batch, + __private const int in_ch, + __private const int in_w, + __private const int in_h, + __private const int out_w, + __private const int out_h) { + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + // filter + const int filter_w = 7; + const int filter_h = 7; + + // item_id + const int item_ch_id = get_global_id(0); + const int item_w_id = get_global_id(1); + const int item_h_id = get_global_id(2); + + // out_width_id_per_blk and out_batch_id + int out_batch_id = item_h_id / in_h; + int out_w_base_id = item_ch_id * out_w; + int out_w_id0 = item_w_id; + int out_w_id1 = out_w_id0 + item_w; + int out_w_id2 = out_w_id1 + item_w; + int out_w_id3 = out_w_id2 + item_w; + int out_w_id4 = out_w_id3 + item_w; + + // in_width_id_per_blk and in_height_id_per_batch + int in_h_id = (item_h_id % out_h) * stride - pad; + int in_w_id0 = item_w_id * stride - pad; + int in_w_id1 = in_w_id0 + item_w * stride; + int in_w_id2 = in_w_id1 + item_w * stride; 
+ int in_w_id3 = in_w_id2 + item_w * stride; + int in_w_id4 = in_w_id3 + item_w * stride; + +#ifdef BIASE_CH + + CL_DTYPE4 output[5]; + output[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(item_ch_id, 0)); + output[1] = output[0]; + output[2] = output[0]; + output[3] = output[0]; + output[4] = output[0]; + +#elif defined(BIASE_ELE) + + CL_DTYPE4 output[5]; + output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id0, item_h_id)); + if (out_w_id1 < out_w) { + output[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id1, item_h_id)); + } + if (out_w_id2 < out_w) { + output[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id2, item_h_id)); + } + if (out_w_id3 < out_w) { + output[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id3, item_h_id)); + } + if (out_w_id4 < out_w) { + output[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, + bias, + sampler, + (int2)(out_w_base_id + out_w_id4, item_h_id)); + } +#else + CL_DTYPE4 output[5] = {0.0f}; +#endif + + CL_DTYPE4 filter[4] = {0.0f}; + CL_DTYPE4 filter_trans[4] = {0.0f}; + CL_DTYPE4 input[5] = {0.0f}; + + int filter_h_val0 = item_ch_id * 4 * filter_h; + int filter_h_val1 = filter_h_val0 + filter_h; + int filter_h_val2 = filter_h_val1 + filter_h; + int filter_h_val3 = filter_h_val2 + filter_h; + + for (int ch = 0; ch < (in_ch + 3) / 4; ch++) { + int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? 
(ch + 1) * 4 - in_ch : 0; + + const int in_w_base_id = mul24(ch, in_w); + + int filter_w_val = ch * filter_w; + + for (int h = 0; h < filter_h; h++) { + int in_h_val = select( + out_batch_id * in_h + in_h_id + h, + -1, + (out_batch_id * in_h + in_h_id + h < out_batch_id * in_h || + out_batch_id * in_h + in_h_id + h >= (out_batch_id + 1) * in_h)); + + for (int w = 0; w < filter_w; w++) { + int in_w_val0 = select(in_w_base_id + in_w_id0 + w, + -1, + (in_w_id0 + w < 0 || in_w_id0 + w >= in_w)); + int in_w_val1 = select(in_w_base_id + in_w_id1 + w, + -1, + (in_w_id1 + w < 0 || in_w_id1 + w >= in_w)); + int in_w_val2 = select(in_w_base_id + in_w_id2 + w, + -1, + (in_w_id2 + w < 0 || in_w_id2 + w >= in_w)); + int in_w_val3 = select(in_w_base_id + in_w_id3 + w, + -1, + (in_w_id3 + w < 0 || in_w_id3 + w >= in_w)); + int in_w_val4 = select(in_w_base_id + in_w_id4 + w, + -1, + (in_w_id4 + w < 0 || in_w_id4 + w >= in_w)); + + filter[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val0 + h)); // in_ch:0-3,out_ch:0 + filter[1] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val1 + h)); // in_ch:0-3,out_ch:1 + filter[2] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val2 + h)); // in_ch:0-3,out_ch:2 + filter[3] = + READ_IMG_TYPE(CL_DTYPE_CHAR, + filter_image, + sampler, + (int2)(filter_w_val + w, + filter_h_val3 + h)); // in_ch:0-3,out_ch:3 + + filter_trans[0] = (CL_DTYPE4)(filter[0].x, + filter[1].x, + filter[2].x, + filter[3].x); // in_ch:0,out_ch:0-3 + filter_trans[1] = (CL_DTYPE4)(filter[0].y, + filter[1].y, + filter[2].y, + filter[3].y); // in_ch:1,out_ch:0-3 + filter_trans[2] = (CL_DTYPE4)(filter[0].z, + filter[1].z, + filter[2].z, + filter[3].z); // in_ch:2,out_ch:0-3 + filter_trans[3] = (CL_DTYPE4)(filter[0].w, + filter[1].w, + filter[2].w, + filter[3].w); // in_ch:3,out_ch:0-3 + + input[0] = READ_IMG_TYPE( + 
CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val0, in_h_val)); + input[1] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val1, in_h_val)); + input[2] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val2, in_h_val)); + input[3] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val3, in_h_val)); + input[4] = READ_IMG_TYPE( + CL_DTYPE_CHAR, input_image, sampler, (int2)(in_w_val4, in_h_val)); + + output[0] = mad(input[0].x, filter_trans[0], output[0]); + output[1] = mad(input[1].x, filter_trans[0], output[1]); + output[2] = mad(input[2].x, filter_trans[0], output[2]); + output[3] = mad(input[3].x, filter_trans[0], output[3]); + output[4] = mad(input[4].x, filter_trans[0], output[4]); + + if (ch_surplus < 3) { + output[0] = mad(input[0].y, filter_trans[1], output[0]); + output[1] = mad(input[1].y, filter_trans[1], output[1]); + output[2] = mad(input[2].y, filter_trans[1], output[2]); + output[3] = mad(input[3].y, filter_trans[1], output[3]); + output[4] = mad(input[4].y, filter_trans[1], output[4]); + } + if (ch_surplus < 2) { + output[0] = mad(input[0].z, filter_trans[2], output[0]); + output[1] = mad(input[1].z, filter_trans[2], output[1]); + output[2] = mad(input[2].z, filter_trans[2], output[2]); + output[3] = mad(input[3].z, filter_trans[2], output[3]); + output[4] = mad(input[4].z, filter_trans[2], output[4]); + } + if (ch_surplus < 1) { + output[0] = mad(input[0].w, filter_trans[3], output[0]); + output[1] = mad(input[1].w, filter_trans[3], output[1]); + output[2] = mad(input[2].w, filter_trans[3], output[2]); + output[3] = mad(input[3].w, filter_trans[3], output[3]); + output[4] = mad(input[4].w, filter_trans[3], output[4]); + } + } + } + } + + output[0] = activation_type4(output[0]); + output[1] = activation_type4(output[1]); + output[2] = activation_type4(output[2]); + output[3] = activation_type4(output[3]); + output[4] = activation_type4(output[4]); + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + 
output_image, + (int2)(out_w_base_id + out_w_id0, item_h_id), + output[0]); + if (out_w_id1 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id1, item_h_id), + output[1]); + } + if (out_w_id2 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id2, item_h_id), + output[2]); + } + if (out_w_id3 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id3, item_h_id), + output[3]); + } + if (out_w_id4 < out_w) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, + output_image, + (int2)(out_w_base_id + out_w_id4, item_h_id), + output[4]); + } +} \ No newline at end of file diff --git a/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl b/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl index 14086dcd16bd1a8770f444bdcd0b6bea78e23b7e..6ab2b59343f09c1284ec21a7913f67c26707301c 100755 --- a/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl @@ -22,10 +22,6 @@ __kernel void depth_conv2d_3x3(__private const int global_size_dim0, __read_only image2d_t filter, #if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, -#endif -#ifdef BATCH_NORM - __read_only image2d_t new_scale, - __read_only image2d_t new_biase, #endif __write_only image2d_t output_image, __private const int stride, @@ -137,13 +133,8 @@ __kernel void depth_conv2d_3x3(__private const int global_size_dim0, for(int i = 0 ;i < 9 ; i++){ output += inputs[i] * filters[i]; } -#ifdef BATCH_NORM - output = output * READ_IMG_TYPE(CL_DTYPE_CHAR, new_scale, sampler, (int2)(out_c, 0)) + READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0)); -#endif -#ifdef RELU output = activation_type4(output); -#endif /* @@ -179,10 +170,6 @@ __kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk, __read_only image2d_t filter, #if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, 
-#endif -#ifdef BATCH_NORM - __read_only image2d_t new_scale, - __read_only image2d_t new_biase, #endif __write_only image2d_t output_image, __private const int stride, @@ -299,19 +286,9 @@ __kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk, output[0] = mad(inputs[10], filters[8], output[0]); output[1] = mad(inputs[11], filters[8], output[1]); -#ifdef BATCH_NORM - CL_DTYPE4 scale = READ_IMG_TYPE(CL_DTYPE_CHAR, new_scale, sampler, (int2)(ou_ch_blk_id, 0)); - CL_DTYPE4 biase = READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(ou_ch_blk_id, 0)); - output[0] = mad(scale, output[0], biase); - if (ou_col_id + 1 < ou_w) { - output[1] = mad(scale, output[1], biase); - } -#endif -#ifdef RELU output[0] = activation_type4(output[0]); output[1] = activation_type4(output[1]); -#endif WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(ou_x, ou_nh_id), output[0]); if (ou_col_id + 1 < ou_w) { diff --git a/lite/backends/opencl/cl_kernel/image/dropout_kernel.cl b/lite/backends/opencl/cl_kernel/image/dropout_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..116b4452dd17e800da20238ad688daf5630d55fb --- /dev/null +++ b/lite/backends/opencl/cl_kernel/image/dropout_kernel.cl @@ -0,0 +1,43 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include + +__kernel void dropout(__read_only image2d_t input_image, + __write_only image2d_t output_image, + __private const int out_W, + __private const float dropoutPro) { + + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + + int2 output_pos; + output_pos.x = out_c * out_W + out_w; + output_pos.y = out_nh; + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + half4 input; + half4 output; + + input = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler,output_pos); + half4 dropout = (half4)(1 - dropoutPro); + output = dropout * input; + + write_imageh(output_image, output_pos, output); +} + + diff --git a/lite/backends/opencl/cl_kernel/image/elementwise_mul_kernel.cl b/lite/backends/opencl/cl_kernel/image/elementwise_mul_kernel.cl index 17b6e8c72a82718a541841ff3c69c175649d7056..73a089d7591b98486daac2d4aaa29fe4f2192134 100644 --- a/lite/backends/opencl/cl_kernel/image/elementwise_mul_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/elementwise_mul_kernel.cl @@ -14,7 +14,8 @@ limitations under the License. 
*/ #include -__kernel void elementwise_mul(__global image2d_t input, __global image2d_t bias, +__kernel void elementwise_mul(__global image2d_t input, + __global image2d_t bias, __write_only image2d_t outputImage) { int x = get_global_id(0); int y = get_global_id(1); @@ -29,8 +30,148 @@ __kernel void elementwise_mul(__global image2d_t input, __global image2d_t bias, WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output); } -__kernel void channel_mul_d1(__read_only image2d_t input, __read_only image2d_t bias, - __write_only image2d_t outputImage, int w) { +__kernel void channel_mul(__global image2d_t input, + __global image2d_t bias, + __write_only image2d_t outputImage, + int w) { + int x = get_global_id(0); + int y = get_global_id(1); + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + int2 coords; + coords.x = x; + coords.y = y; + int2 coords_bias; + coords_bias.x = x / w; + coords_bias.y = 0; + CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coords); + CL_DTYPE4 biase = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias); + CL_DTYPE4 output = in * biase; + WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output); +} + +// etc : 1 1 1 72 +// run time Y [value,0,0,0] * 72 +__kernel void channel_mul_d2(__global image2d_t input, + __global image2d_t bias, + __write_only image2d_t outputImage, + int w) { + int x = get_global_id(0); + int y = get_global_id(1); + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + int2 coords; + coords.x = x; + coords.y = y; + int2 coords_bias0; + int2 coords_bias1; + int2 coords_bias2; + int2 coords_bias3; + /* if (x == 0 && y == 0) { + CL_DTYPE4 b = (CL_DTYPE4){0, 0, 0, 0}; + #define PPI(j, k) \ + b = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2){j, k}); \ + printf("bias(%d,%d)={ %f , %f , %f , %f }\n ", j, k, convert_float(b.x), \ + convert_float(b.y), convert_float(b.z), convert_float(b.w)); + for (int i = 0; 
i < 73; ++i) { + PPI(i, 0); + } + #undef PPI + }*/ + coords_bias0.x = x / w * 4; + coords_bias0.y = 0; + coords_bias1.x = x / w * 4 + 1; + coords_bias1.y = 0; + coords_bias2.x = x / w * 4 + 2; + coords_bias2.y = 0; + coords_bias3.x = x / w * 4 + 3; + coords_bias3.y = 0; + CL_DTYPE4 biase0 = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias0); + CL_DTYPE4 biase1 = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias1); + CL_DTYPE4 biase2 = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias2); + CL_DTYPE4 biase3 = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias3); + /* if (x == 0 && y == 0) { + printf("bias0={ %f , %f , %f , %f }\n ", + convert_float(biase0.x), convert_float(biase0.y), + convert_float(biase0.z), convert_float(biase0.w)); + printf("bias1={ %f , %f , %f , %f }\n ", + convert_float(biase1.x), convert_float(biase1.y), + convert_float(biase1.z), convert_float(biase1.w)); + printf("bias2={ %f , %f , %f , %f }\n ", + convert_float(biase2.x), convert_float(biase2.y), + convert_float(biase2.z), convert_float(biase2.w)); + printf("bias3={ %f , %f , %f , %f }\n ", + convert_float(biase3.x), convert_float(biase3.y), + convert_float(biase3.z), convert_float(biase3.w)); + }*/ + CL_DTYPE4 biase = {biase0.x, biase1.x, biase2.x, biase3.x}; + CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coords); + CL_DTYPE4 output = mad(in, biase, 0); + WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output); +} + +// c 1 1 +__kernel void channel_mul_d3(__global image2d_t input, + __global image2d_t bias, + __write_only image2d_t outputImage, + int w) { + int x = get_global_id(0); + int y = get_global_id(1); + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + int2 coords; + coords.x = x; + coords.y = y; + int2 coords_bias; + coords_bias.x = x / w; + coords_bias.y = 0; + CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coords); + CL_DTYPE4 biase = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, 
sampler, coords_bias); + CL_DTYPE4 output = in * biase; + WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output); +} + +__kernel void channel_mul_d4(__global image2d_t input, +__global image2d_t bias, + __write_only image2d_t outputImage, int w) { + int x = get_global_id(0); + int y = get_global_id(1); + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + int2 coords; + coords.x = x; + coords.y = y; + int2 coords_bias; + coords_bias.x = x / w; + coords_bias.y = 0; + CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coords); + CL_DTYPE4 biase = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias); + CL_DTYPE4 output = in * biase; + WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output); +} + +#if 0 // TODO(ysh329): comment code below +__kernel void elementwise_mul(__global image2d_t input, + __global image2d_t bias, + __write_only image2d_t outputImage) { + int x = get_global_id(0); + int y = get_global_id(1); + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + int2 coords; + coords.x = x; + coords.y = y; + CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coords); + CL_DTYPE4 biase = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords); + CL_DTYPE4 output = in * biase; + WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output); +} + + +__kernel void channel_mul_d1(__read_only image2d_t input, + __read_only image2d_t bias, + __write_only image2d_t outputImage, + int w) { int x = get_global_id(0); int y = get_global_id(1); @@ -52,8 +193,88 @@ __kernel void channel_mul_d1(__read_only image2d_t input, __read_only image2d_t WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output); } -__kernel void channel_mul_d2(__read_only image2d_t input, __read_only image2d_t bias, - __write_only image2d_t outputImage, int w, int h) { + +// #define DEBUG +__kernel void channel_mul_d2_nc(__read_only image2d_t input, + __read_only image2d_t bias, + 
__write_only image2d_t outputImage, + int w) { + int x = get_global_id(0); + int y = get_global_id(1); + +#ifdef DEBUG + printf("x:%d y:%d\n", x, y); +#endif + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + int2 coords; + coords.x = x; + coords.y = y; + CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coords); + + int2 coords_bias0 = (int2)(x / w * 4, 0); + int2 coords_bias1 = (int2)(x / w * 4 + 1, 0); + int2 coords_bias2 = (int2)(x / w * 4 + 2, 0); + int2 coords_bias3 = (int2)(x / w * 4 + 3, 0); + + CL_DTYPE4 b0 = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias0); + CL_DTYPE4 b1 = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias1); + CL_DTYPE4 b2 = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias2); + CL_DTYPE4 b3 = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias3); + + CL_DTYPE4 biase = {b0.x, b1.x, b2.x, b3.x}; + CL_DTYPE4 output = mad(in, biase, 0); + +#ifdef DEBUG + if (x == 0 && y == 0) { + printf("w:%d\n", w); + + printf("biase:%.1f %.1f %.1f %.1f\n", biase.x, biase.y, biase.z, biase.w); + printf("output:%.1f %.1f %.1f %.1f\n", output.x, output.y, output.z, output.w); + + coords.x = 0; + coords.y = 0; + in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coords); + printf("in(%d,%d):%.2f %.2f %.2f %.2f\n", coords.x, coords.y, in.x, in.y, in.z, in.w); + coords.x = 0; + coords.y = 1; + in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coords); + printf("in(%d,%d):%.2f %.2f %.2f %.2f\n", coords.x, coords.y, in.x, in.y, in.z, in.w); + coords.x = 1; + coords.y = 0; + in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coords); + printf("in(%d,%d):%.2f %.2f %.2f %.2f\n", coords.x, coords.y, in.x, in.y, in.z, in.w); + coords.x = 1; + coords.y = 1; + in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coords); + printf("in(%d,%d):%.2f %.2f %.2f %.2f\n", coords.x, coords.y, in.x, in.y, in.z, in.w); + + coords_bias.x = 0; + coords_bias.y = 0; + biase = 
READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias); + printf("biase(%d,%d):%.2f %.2f %.2f %.2f\n", coords_bias.x, coords_bias.y, biase.x, biase.y, biase.z, biase.w); + coords_bias.x = 1; + coords_bias.y = 0; + biase = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias); + printf("biase(%d,%d):%.2f %.2f %.2f %.2f\n", coords_bias.x, coords_bias.y, biase.x, biase.y, biase.z, biase.w); + coords_bias.x = 2; + coords_bias.y = 0; + biase = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias); + printf("biase(%d,%d):%.2f %.2f %.2f %.2f\n", coords_bias.x, coords_bias.y, biase.x, biase.y, biase.z, biase.w); + } +#endif + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output); +} + + +__kernel void channel_mul_d2_hw(__read_only image2d_t input, + __read_only image2d_t bias, + __write_only image2d_t outputImage, + int w, + int h) { int x = get_global_id(0); int y = get_global_id(1); @@ -75,8 +296,11 @@ __kernel void channel_mul_d2(__read_only image2d_t input, __read_only image2d_t WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output); } -__kernel void channel_mul_d4(__read_only image2d_t input, __read_only image2d_t bias, - __write_only image2d_t outputImage, int w) { + +__kernel void channel_mul_d4(__read_only image2d_t input, + __read_only image2d_t bias, + __write_only image2d_t outputImage, + int w) { int x = get_global_id(0); int y = get_global_id(1); @@ -97,4 +321,4 @@ __kernel void channel_mul_d4(__read_only image2d_t input, __read_only image2d_t WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output); } - +#endif diff --git a/lite/backends/opencl/cl_kernel/image/elementwise_sub_kernel.cl b/lite/backends/opencl/cl_kernel/image/elementwise_sub_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..6ed6af298f23bcfb396aefe7593ccfd52c732937 --- /dev/null +++ b/lite/backends/opencl/cl_kernel/image/elementwise_sub_kernel.cl @@ -0,0 +1,85 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +__kernel void elementwise_sub(__read_only image2d_t input, + __read_only image2d_t bias, + __write_only image2d_t outputImage) { + int x = get_global_id(0); + int y = get_global_id(1); + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + int2 coords; + coords.x = x; + coords.y = y; + + CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coords); + CL_DTYPE4 biase = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords); + CL_DTYPE4 output = activation_type4(in - biase); + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage,coords,output); + } + +__kernel void channel_sub(__read_only image2d_t input, + __read_only image2d_t bias, + __write_only image2d_t outputImage, + int w) { + int x = get_global_id(0); + int y = get_global_id(1); + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + int2 coords; + coords.x = x; + coords.y = y; + + int2 coords_bias; + coords_bias.x = x % w; + coords_bias.y = 0; + + CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coords); + CL_DTYPE4 biase = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias); + CL_DTYPE4 output = in - (CL_DTYPE4)(biase.x); + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output); + } + +__kernel void width_sub(__read_only image2d_t input, + __read_only image2d_t bias, + __write_only image2d_t outputImage, + int w) { + int x = get_global_id(0); + int y = 
get_global_id(1); + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + int2 coords; + coords.x = x; + coords.y = y; + + int2 coords_bias; + coords_bias.x = x % w; + coords_bias.y = 0; + + CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coords); + CL_DTYPE4 biase = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias); + CL_DTYPE4 output; + + output.x = in.x - biase.x; + output.y = in.y - biase.x; + output.z = in.z - biase.x; + output.w = in.w - biase.x; + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output); +} diff --git a/lite/backends/opencl/cl_kernel/image/grid_sampler_kernel.cl b/lite/backends/opencl/cl_kernel/image/grid_sampler_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..360d8c753ef64b1da2ff2aeebddd94ff0f41db96 --- /dev/null +++ b/lite/backends/opencl/cl_kernel/image/grid_sampler_kernel.cl @@ -0,0 +1,168 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +__kernel void grid_sampler(__read_only image2d_t input, + __read_only image2d_t grid, + __write_only image2d_t output, + __private const int out_height, + __private const int out_width) { + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2) * 4; + const int out_n = out_nh / out_height; + const int out_h = out_nh % out_height; + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + int2 coords1, coords2, outpoints; + coords1.x = out_h / 4 * 2; + coords1.y = out_n * out_width + out_w; + coords2.x = coords1.x + 1; + coords2.y = coords1.y; + outpoints.x = out_c * out_width + out_w; + outpoints.x = out_n * out_height + out_h; + + CL_DTYPE4 g1 = READ_IMG_TYPE(CL_DTYPE_CHAR, grid, sampler, coords1); + CL_DTYPE4 g2 = READ_IMG_TYPE(CL_DTYPE_CHAR, grid, sampler, coords2); + + // x + float x = (g1.x + 1) * (out_width - 1) * 0.5; + float y = (g2.x + 1) * (out_height - 1) * 0.5; + int x0 = floor(x); + int y0 = floor(y); + int x_p = out_c * out_width + x0; + int y_p = out_n * out_height + y0; + + float xs = x - x0; + float xe = x0 + 1 - x; + float ys = y - y0; + float ye = y0 + 1 - y; + + CL_DTYPE4 input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x_p, y_p)); + CL_DTYPE4 input1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x_p + 1, y_p)); + CL_DTYPE4 input2 = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x_p, y_p + 1)); + CL_DTYPE4 input3 = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x_p + 1, y_p + 1)); + + if (x0 < 0 || x0 > out_width - 1 || y0 < 0 || y0 > out_height - 1){ + input0 = (CL_DTYPE4)(0.0); + } + if (x0 + 1 < 0 || x0 + 1 > out_width - 1 || y0 < 0 || y0 > out_height - 1){ + input1 = (CL_DTYPE4)(0.0); + } + if (x0 < 0 || x0 > out_width - 1 || y0 + 1 < 0 || y0 + 1 > out_height - 1){ + input2 = (CL_DTYPE4)(0.0); + } + if (x0 + 1 < 0 || x0 + 1 > out_width - 1 || y0 + 1 < 0 || y0 + 1 > out_height - 1){ + input3 = 
(CL_DTYPE4)(0.0); + } + CL_DTYPE4 out_val = input0 * xe * ye + input1 * xs * ye + input2 * xe * ys + input3 * xs * ys; + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, outpoints, out_val); + + // y + x = (g1.y + 1) * (out_width - 1) / 2; + y = (g2.y + 1) * (out_height - 1) / 2; + x0 = floor(x); + y0 = floor(y); + x_p = out_c * out_width + x0; + y_p = out_n * out_height + y0; + + xs = x - x0; + xe = x0 + 1 - x; + ys = y - y0; + ye = y0 + 1 - y; + + input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x_p, y_p)); + input1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x_p + 1, y_p)); + input2 = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x_p, y_p + 1)); + input3 = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x_p + 1, y_p + 1)); + + if (x0 < 0 || x0 > out_width - 1 || y0 < 0 || y0 > out_height - 1){ + input0 = (CL_DTYPE4)(0.0); + } + if (x0 + 1 < 0 || x0 + 1 > out_width - 1 || y0 < 0 || y0 > out_height - 1){ + input1 = (CL_DTYPE4)(0.0); + } + if (x0 < 0 || x0 > out_width - 1 || y0 + 1 < 0 || y0 + 1 > out_height - 1){ + input2 = (CL_DTYPE4)(0.0); + } + if (x0 + 1 < 0 || x0 + 1 > out_width - 1 || y0 + 1 < 0 || y0 + 1 > out_height - 1){ + input3 = (CL_DTYPE4)(0.0); + } + + out_val = input0 * xe * ye + input1 * xs * ye + input2 * xe * ys + input3 * xs * ys; + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(outpoints.x, outpoints.y + 1), out_val); + + // z + x = (g1.z + 1) * (out_width - 1) / 2; + y = (g2.z + 1) * (out_height - 1) / 2; + x0 = floor(x); + y0 = floor(y); + x_p = out_c * out_width + x0; + y_p = out_n * out_height + y0; + + xs = x - x0; + xe = x0 + 1 - x; + ys = y - y0; + ye = y0 + 1 - y; + + input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x_p, y_p)); + input1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x_p + 1, y_p)); + input2 = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x_p, y_p + 1)); + input3 = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x_p + 1, y_p + 1)); + + if (x0 < 0 || x0 > out_width - 1 || y0 
< 0 || y0 > out_height - 1){ + input0 = (CL_DTYPE4)(0.0); + } + if (x0 + 1 < 0 || x0 + 1 > out_width - 1 || y0 < 0 || y0 > out_height - 1){ + input1 = (CL_DTYPE4)(0.0); + } + if (x0 < 0 || x0 > out_width - 1 || y0 + 1 < 0 || y0 + 1 > out_height - 1){ + input2 = (CL_DTYPE4)(0.0); + } + if (x0 + 1 < 0 || x0 + 1 > out_width - 1 || y0 + 1 < 0 || y0 + 1 > out_height - 1){ + input3 = (CL_DTYPE4)(0.0); + } + out_val = input0 * xe * ye + input1 * xs * ye + input2 * xe * ys + input3 * xs * ys; + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(outpoints.x, outpoints.y + 2), out_val); + + // w + x = (g1.w + 1) * (out_width - 1) / 2; + y = (g2.w + 1) * (out_height - 1) / 2; + x0 = floor(x); + y0 = floor(y); + x_p = out_c * out_width + x0; + y_p = out_n * out_height + y0; + + xs = x - x0; + xe = x0 + 1 - x; + ys = y - y0; + ye = y0 + 1 - y; + + input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x_p, y_p)); + input1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x_p + 1, y_p)); + input2 = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x_p, y_p + 1)); + input3 = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x_p + 1, y_p + 1)); + + if (x0 < 0 || x0 > out_width - 1 || y0 < 0 || y0 > out_height - 1){ + input0 = (CL_DTYPE4)(0.0); + } + if (x0 + 1 < 0 || x0 + 1 > out_width - 1 || y0 < 0 || y0 > out_height - 1){ + input1 = (CL_DTYPE4)(0.0); + } + if (x0 < 0 || x0 > out_width - 1 || y0 + 1 < 0 || y0 + 1 > out_height - 1){ + input2 = (CL_DTYPE4)(0.0); + } + if (x0 + 1 < 0 || x0 + 1 > out_width - 1 || y0 + 1 < 0 || y0 + 1 > out_height - 1){ + input3 = (CL_DTYPE4)(0.0); + } + out_val = input0 * xe * ye + input1 * xs * ye + input2 * xe * ys + input3 * xs * ys; + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(outpoints.x, outpoints.y + 3), out_val); +} diff --git a/lite/backends/opencl/cl_kernel/image/instance_norm_kernel.cl b/lite/backends/opencl/cl_kernel/image/instance_norm_kernel.cl new file mode 100644 index 
0000000000000000000000000000000000000000..3e3d65394f9924edac735084c2fe5ce550f20684 --- /dev/null +++ b/lite/backends/opencl/cl_kernel/image/instance_norm_kernel.cl @@ -0,0 +1,192 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +// onnx/pytorch instancenorm by lijian +__kernel void instance_norm_onnx(__private const int in_width, + __private const int in_height, + __private const int in_c_group, + __private const int local_work_size_x, + __private const int local_work_size_y, + __private const float epsilon, + __read_only image2d_t input, + __write_only image2d_t output) { + const int out_cn = get_global_id(0); + const int n = out_cn / in_c_group; + const int c = out_cn % in_c_group; + const int w = get_local_id(1); + const int h = get_local_id(2); + const int local_id = w * local_work_size_y + h; + const int local_total_size = local_work_size_x * local_work_size_y; + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; +#ifdef LOCAL_MEM_128 + __local float4 shared_mem[128]; +#elif defined(LOCAL_MEM_64) + __local float4 shared_mem[64]; +#else + __local float4 shared_mem[256]; +#endif + int xOffset = c * in_width; + int yOffset = n * in_height; + float4 sum = 0.0f; + for (int xIndex = w; xIndex < in_width; xIndex += local_work_size_x) { + for (int yIndex = h; yIndex < in_height; yIndex += local_work_size_y) { + sum += read_imagef(input, sampler, (int2)(xOffset 
+ xIndex, yOffset + yIndex)); + } + } + shared_mem[local_id] = sum; + + barrier(CLK_LOCAL_MEM_FENCE); + + sum = 0.0f; + if (local_id < 32) { + for (int i = local_id + 32; i < local_total_size; i += 32) { + sum += shared_mem[i]; + } + } + shared_mem[local_id] += sum; + + barrier(CLK_LOCAL_MEM_FENCE); + + sum = 0.0f; + if (local_id == 0) { + int top = min(32, local_total_size); + for (int i = 0; i < top; i += 1) { + sum += shared_mem[i]; + } + shared_mem[0] = sum / (in_width * in_height); + } + + barrier(CLK_LOCAL_MEM_FENCE); + + const float4 mean_val = shared_mem[0]; + + barrier(CLK_LOCAL_MEM_FENCE); + + sum = 0.0f; + for (int xIndex = w; xIndex < in_width; xIndex += local_work_size_x) { + for (int yIndex = h; yIndex < in_height; yIndex += local_work_size_y) { + float4 temp = read_imagef(input, sampler, (int2)(xOffset + xIndex, yOffset + yIndex)) - mean_val; + sum += temp * temp; + } + } + shared_mem[local_id] = sum; + + barrier(CLK_LOCAL_MEM_FENCE); + + sum = 0.0f; + if (local_id < 32) { + for (int i = local_id + 32; i < local_total_size; i += 32) { + sum += shared_mem[i]; + } + } + shared_mem[local_id] += sum; + + barrier(CLK_LOCAL_MEM_FENCE); + + sum = 0.0f; + if (local_id == 0) { + int top = min(32, local_total_size); + for (int i = 0; i < top; i += 1) { + sum += shared_mem[i]; + } + shared_mem[0] = sum / (in_width * in_height); + } + + barrier(CLK_LOCAL_MEM_FENCE); + + const float4 sigma = sqrt(shared_mem[0] + (float4)(epsilon)); + + float4 s = 1 / sigma; + + for (int xIndex = w; xIndex < in_width; xIndex += local_work_size_x) { + for (int yIndex = h; yIndex < in_height; yIndex += local_work_size_y) { + int2 intout_pos = (int2)(xOffset + xIndex, yOffset + yIndex); + float4 in_val = read_imagef(input, sampler, intout_pos); + half4 out_val = convert_half4((in_val - mean_val) * s); +#ifdef RELU + out_val = activation(out_val); +#endif + write_imageh(output, intout_pos, out_val); + } + } +} + + +// paddle instancenorm by zhangxi +__kernel void 
instance_norm_paddle(__read_only image2d_t input, + __write_only image2d_t output, + __read_only image2d_t scale, + __read_only image2d_t bias, + const float epsilon, + const int in_h, + const int in_w){ + __local CL_DTYPE4 saved_mean[1024]; + __local CL_DTYPE4 saved_variance[1024]; + const int lid = get_local_id(0); + const int lsize = get_local_size(0); + const int gidx = get_group_id(0); + const int gidy = get_group_id(1); + const int spatial_size = in_h * in_w; + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + CL_DTYPE4 mean = (CL_DTYPE4)(0.f, 0.f, 0.f, 0.f); + CL_DTYPE4 variance = (CL_DTYPE4)(0.f, 0.f, 0.f, 0.f); + CL_DTYPE4 vepsilon = (CL_DTYPE4)(epsilon, epsilon, epsilon, epsilon); + const int x_offset = gidx * in_w; + const int y_offset = gidy * in_h; + int2 coor; + for (int i = lid; i < spatial_size; i += lsize) { + coor.x = i % in_w + x_offset; + coor.y = i / in_w + y_offset; + CL_DTYPE4 pixel = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coor); + mean += pixel; + variance += pixel * pixel; + } + saved_mean[lid] = mean; + saved_variance[lid] = variance; + barrier(CLK_LOCAL_MEM_FENCE); + + //! do reduction + int dynamic_size = lsize >> 1; + for (; dynamic_size > 0; dynamic_size >>= 1){ + if (lid < dynamic_size) { + saved_mean[lid] += saved_mean[lid + dynamic_size]; + saved_variance[lid] += saved_variance[lid + dynamic_size]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + mean = saved_mean[0] / spatial_size; + variance = saved_variance[0] / spatial_size - mean * mean; + variance = rsqrt(variance + vepsilon); + + //! 
do instance norm + coor.x = gidx; + coor.y = gidy; + CL_DTYPE4 vscale = READ_IMG_TYPE(CL_DTYPE_CHAR, scale, sampler, coor); + vscale *= variance; + CL_DTYPE4 vbias = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coor); + for (int i = lid; i < spatial_size; i += lsize) { + coor.x = i % in_w + x_offset; + coor.y = i / in_w + y_offset; + CL_DTYPE4 pixel = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coor); + pixel = (pixel - mean) * vscale + vbias; + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, coor, pixel); + } +} diff --git a/lite/backends/opencl/cl_kernel/image/layout_kernel.cl b/lite/backends/opencl/cl_kernel/image/layout_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..6c419fe3c134614d28b3bcee3eabac5e8f7bdf6e --- /dev/null +++ b/lite/backends/opencl/cl_kernel/image/layout_kernel.cl @@ -0,0 +1,298 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include + +// #define DEBUG +//////////////////////////////////////////////////////// +// buffer -> image2d +//////////////////////////////////////////////////////// +__kernel void buffer_to_image2d(__global CL_DTYPE *in, + __write_only image2d_t output_image, + __private const int out_H, + __private const int out_W, + __private const int out_C, + __private const int Stride0, + __private const int Stride1, + __private const int Stride2) { + + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + + const int out_n = out_nh / out_H; + const int out_h = out_nh % out_H; + + const int in_n = out_n; + const int in_c0 = out_c * 4 + 0; + const int in_c1 = out_c * 4 + 1; + const int in_c2 = out_c * 4 + 2; + const int in_c3 = out_c * 4 + 3; + const int in_h = out_h; + const int in_w = out_w; + + int input_pos0 = in_n * Stride2 + in_c0 * Stride1 + in_h * Stride0 + in_w; + int input_pos1 = in_n * Stride2 + in_c1 * Stride1 + in_h * Stride0 + in_w; + int input_pos2 = in_n * Stride2 + in_c2 * Stride1 + in_h * Stride0 + in_w; + int input_pos3 = in_n * Stride2 + in_c3 * Stride1 + in_h * Stride0 + in_w; + + int2 output_pos; + output_pos.x = out_c * out_W + out_w; + output_pos.y = out_nh; + + CL_COMPUTE_DTYPE4 output = (CL_COMPUTE_DTYPE4)(0.f, 0.f, 0.f, 0.f); + output.x = CONVERT_TYPE_TO(in[input_pos0], CL_COMPUTE_DTYPE); + + if (out_C - 4 * out_c >= 2) { + output.y = CONVERT_TYPE_TO(in[input_pos1], CL_COMPUTE_DTYPE); + } + if (out_C - 4 * out_c >= 3) { + output.z = CONVERT_TYPE_TO(in[input_pos2], CL_COMPUTE_DTYPE); + } + if (out_C - 4 * out_c >= 4) { + output.w = CONVERT_TYPE_TO(in[input_pos3], CL_COMPUTE_DTYPE); + } + +#ifdef DEBUG + if (out_w > 2045) { + printf("out_w:%d, out_C - 4 * out_c:%d, input[pos0~pos3]:%.2f %.2f %.2f %.2f\n", + out_w, + out_C - 4 * out_c, + (float)(in[input_pos0]), + (float)(in[input_pos1]), + (float)(in[input_pos2]), + (float)(in[input_pos3])); + printf("buffer2image ===> %d,%d,%d, 
out(%d,%d): %.2f %.2f %.2f %.2f \n", out_c, out_w, out_nh, + output_pos.x, output_pos.y, + (float)(output.x), (float)(output.y), (float)(output.z), (float)(output.w)); + } +#endif + + WRITE_IMG_TYPE(CL_COMPUTE_DTYPE_CHAR, output_image, output_pos, output); +} + +//////////////////////////////////////////////////////// +// image2d -> buffer +//////////////////////////////////////////////////////// +__kernel void image2d_to_buffer(__read_only image2d_t input, + __private const int in_width, + __private const int in_height, + __global CL_DTYPE* out, + __private const int size_ch, + __private const int size_block, + __private const int size_batch, + __private const int C) { + const int in_c = get_global_id(0); + const int in_w = get_global_id(1); + const int in_nh = get_global_id(2); + + const int in_n = in_nh / in_height; + const int in_h = in_nh % in_height; + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + const int pos_x = mad24(in_c, in_width, in_w); + CL_COMPUTE_DTYPE4 in = READ_IMG_TYPE(CL_COMPUTE_DTYPE_CHAR, input, sampler, (int2)(pos_x, in_nh)); + +#ifdef DEBUG + if (in_w > 2045) { + printf("image2buffer ===> %d,%d,%d, in(%d,%d): %.2f %.2f %.2f %.2f \n", in_c, in_w, in_nh, + pos_x, in_nh, + (float)(in.x), (float)(in.y), (float)(in.z), (float)(in.w)); + } +#endif + + const int index = in_n * size_batch + in_c * size_block + in_h * in_width + in_w; + out[index] = CONVERT_TYPE_TO(in.x, CL_DTYPE); + if (C - 4 * in_c >= 2) { + out[index + size_ch] = CONVERT_TYPE_TO(in.y, CL_DTYPE); + } + if(C - 4 * in_c >= 3) { + out[index + size_ch * 2] = CONVERT_TYPE_TO(in.z, CL_DTYPE); + } + if(C - 4 * in_c >= 4) { + out[index + size_ch * 3] = CONVERT_TYPE_TO(in.w, CL_DTYPE); + } +} + + +#if 0 // NOTE(ysh329): keep, un-used from paddle-mobile +//////////////////////////////////////////////////////// +// buffer -> image2d_nw +//////////////////////////////////////////////////////// +__kernel void buffer_to_image2d_nw(__global 
CL_DTYPE* in, + __write_only image2d_t output_image, + __private const int out_H, + __private const int out_W, + __private const int out_N, + __private const int Stride0, + __private const int Stride1, + __private const int Stride2) { + const int out_n = get_global_id(0); + const int out_w = get_global_id(1); + const int out_ch = get_global_id(2); + + const int out_c = out_ch / out_H; + const int out_h = out_ch % out_H; + + const int in_c = out_c; // index of c in h direction + + const int in_n0 = out_n * 4 + 0; + const int in_n1 = out_n * 4 + 1; + const int in_n2 = out_n * 4 + 2; + const int in_n3 = out_n * 4 + 3; + + const int in_h = out_h; + const int in_w = out_w; + + int input_pos0 = in_n0 * Stride2 + in_c * Stride1 + in_h * Stride0 + in_w; + int input_pos1 = in_n1 * Stride2 + in_c * Stride1 + in_h * Stride0 + in_w; + int input_pos2 = in_n2 * Stride2 + in_c * Stride1 + in_h * Stride0 + in_w; + int input_pos3 = in_n3 * Stride2 + in_c * Stride1 + in_h * Stride0 + in_w; + + int2 output_pos; + output_pos.x = out_n * out_W + out_w; + output_pos.y = out_ch; + + CL_DTYPE4 output = (CL_DTYPE4)0.0f; + output.x = CONVERT_TYPE_TO(CL_DTYPE, in[input_pos0]); + if (out_N - 4 * out_n >= 2) { + output.y = CONVERT_TYPE_TO(CL_DTYPE, in[input_pos1]); + } + if (out_N - 4 * out_n >= 3) { + output.z = CONVERT_TYPE_TO(CL_DTYPE, in[input_pos2]); + } + if (out_N - 4 * out_n >= 4) { + output.w = CONVERT_TYPE_TO(CL_DTYPE, in[input_pos3]); + } + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); +} +#endif + + +#if 0 // NOTE(ysh329): keep, un-used from paddle-mobile +// image2d -> buffer +__kernel void image2d_to_buffer_2d(__private const int in_height, + __private const int in_width, + __read_only image2d_t input, + __global CL_DTYPE* out) { + const int in_w = get_global_id(1); + const int in_h = get_global_id(2); + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, 
sampler, (int2)(in_w, in_h)); + + const int index = (in_h * in_width + in_w) * 4; + out[index] = CONVERT_TYPE_TO(CL_DTYPE, in.x); + out[index + 1] = CONVERT_TYPE_TO(CL_DTYPE, in.y); + out[index + 2] = CONVERT_TYPE_TO(CL_DTYPE, in.z); + out[index + 3] = CONVERT_TYPE_TO(CL_DTYPE, in.w); +} +#endif + +//////////////////////////////////////////////////////// +// buffer -> image2d (divide by 255 to normalize) +//////////////////////////////////////////////////////// +__kernel void buffer_to_image2d_with_pre255(__global uchar *in, + __write_only image2d_t output_image, + __private const int out_H, + __private const int out_W, + __private const int out_C, + __private const int Stride0, + __private const int Stride1, + __private const int Stride2){ + + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + const int out_n = out_nh / out_H; + const int out_h = out_nh % out_H; + + const int in_n = out_n; + const int in_c0 = out_c * 4 + 0; + const int in_c1 = out_c * 4 + 1; + const int in_c2 = out_c * 4 + 2; + const int in_c3 = out_c * 4 + 3; + const int in_h = out_h; + const int in_w = out_w; + + + int input_pos0 = in_n * Stride2 + in_c0 * Stride1 + in_h * Stride0 + in_w; + int input_pos1 = in_n * Stride2 + in_c1 * Stride1 + in_h * Stride0 + in_w; + int input_pos2 = in_n * Stride2 + in_c2 * Stride1 + in_h * Stride0 + in_w; + int input_pos3 = in_n * Stride2 + in_c3 * Stride1 + in_h * Stride0 + in_w; + + int2 output_pos; + output_pos.x = out_c * out_W + out_w; + output_pos.y = out_nh; + + CL_COMPUTE_DTYPE4 output = (CL_COMPUTE_DTYPE4)0.0f; + output.x = CONVERT_TYPE_TO(in[input_pos0], CL_COMPUTE_DTYPE) / 255; + if(out_C - 4 * out_c>=2){ + output.y = CONVERT_TYPE_TO(in[input_pos1], CL_COMPUTE_DTYPE) / 255; + } + if(out_C - 4 * out_c>=3){ + output.z = CONVERT_TYPE_TO(in[input_pos2], CL_COMPUTE_DTYPE) / 255; + } + if(out_C - 4 * out_c>=4){ + output.w = CONVERT_TYPE_TO(in[input_pos3], CL_COMPUTE_DTYPE) / 255; + } + 
WRITE_IMG_TYPE(CL_COMPUTE_DTYPE_CHAR, output_image, output_pos, output); +} + + +//////////////////////////////////////////////////////// +// image2d -> buffer (multiply by 255 to de-normalize) +//////////////////////////////////////////////////////// +__kernel void image2d_to_buffer_with_post255(__read_only image2d_t input, + __private const int in_width, + __private const int in_height, + __global uchar* out, + __private const int size_ch, + __private const int size_block, + __private const int size_batch, + __private const int C) { + const int in_c = get_global_id(0); + const int in_w = get_global_id(1); + const int in_nh = get_global_id(2); + const int in_n = in_nh / in_height; + const int in_h = in_nh % in_height; + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + const int pos_x = mad24(in_c, in_width, in_w); + CL_COMPUTE_DTYPE4 in = READ_IMG_TYPE(CL_COMPUTE_DTYPE_CHAR, input, sampler, (int2)(pos_x, in_nh)) * 255; + +#ifdef DEBUG + printf("in_c:%d, in_w:%d, in_nh:%d ===> in(%d,%d): %.2f %.2f %.2f %.2f\n", + in_c, in_w, in_nh, pos_x, in_nh, in.x, in.y, in.z, in.w); +#endif + + const int index = in_n * size_batch + in_c * size_block + in_h * in_width + in_w; + out[index] = convert_uchar_sat(in.x); + if(C - 4 * in_c>=2){ + out[index + size_ch] = convert_uchar_sat(in.y); + } + if(C - 4 * in_c>=3){ + out[index + size_ch * 2] = convert_uchar_sat(in.z); + } + if(C - 4 * in_c>=4){ + out[index + size_ch * 3] = convert_uchar_sat(in.w); + } +} diff --git a/lite/backends/opencl/cl_kernel/image/lrn_kernel.cl b/lite/backends/opencl/cl_kernel/image/lrn_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..655a2657e07c419d4e50aed0e78cb8c37afa4b2a --- /dev/null +++ b/lite/backends/opencl/cl_kernel/image/lrn_kernel.cl @@ -0,0 +1,159 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +__kernel void lrn(__read_only image2d_t input, + __write_only image2d_t output, + __private const int out_C, + __private const int out_W, + __private const int local_size, + __private const float k, + __private const float alpha, + __private const float beta){ + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + + const int out_c0 = out_c * 4; + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + + const int out_c1 = out_c0 + 1; + const int out_c2 = out_c0 + 2; + const int out_c3 = out_c0 + 3; + + const int pad = (local_size - 1) / 2; + int start = out_c0 - pad; + int end = out_c0 + pad; + start = start > 0 ? start : 0; + end = end < out_C - 1 ? 
end : out_C - 1; + float square0 = 0.0; + float square1 = 0.0; + float square2 = 0.0; + float square3 = 0.0; + for (int i = start; i <= end; i++){ + int input_c0 = i / 4; + int2 input_pos; + input_pos.x = input_c0 * out_W + out_w; + input_pos.y = out_nh; + CL_DTYPE4 input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, input_pos); + int num = i % 4; + switch (num){ + case 0: + square0 += input_data.x * input_data.x; + break; + case 1: + square0 += input_data.y * input_data.y; + break; + case 2: + square0 += input_data.z * input_data.z; + break; + case 3: + square0 += input_data.w * input_data.w; + break; + } + } + start = max(out_c1 - pad, 0); + end = min(out_c1 + pad, out_C - 1); + for (int i = start; i <= end; i++){ + int input_c0 = i / 4; + int2 input_pos; + input_pos.x = input_c0 * out_W + out_w; + input_pos.y = out_nh; + CL_DTYPE4 input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, input_pos); + int num = i % 4; + switch (num){ + case 0: + square1 += input_data.x * input_data.x; + break; + case 1: + square1 += input_data.y * input_data.y; + break; + case 2: + square1 += input_data.z * input_data.z; + break; + case 3: + square1 += input_data.w * input_data.w; + break; + } + } + start = max(out_c2 - pad, 0); + end = min(out_c2 + pad, out_C - 1); + for (int i = start; i <= end; i++){ + int input_c0 = i / 4; + int2 input_pos; + input_pos.x = input_c0 * out_W + out_w; + input_pos.y = out_nh; + CL_DTYPE4 input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, input_pos); + int num = i % 4; + switch (num){ + case 0: + square2 += input_data.x * input_data.x; + break; + case 1: + square2 += input_data.y * input_data.y; + break; + case 2: + square2 += input_data.z * input_data.z; + break; + case 3: + square2 += input_data.w * input_data.w; + break; + } + } + start = max(out_c3 - pad, 0); + end = min(out_c3 + pad, out_C - 1); + for (int i = start; i <= end; i++){ + int input_c0 = i / 4; + int2 input_pos; + input_pos.x = input_c0 * out_W + out_w; + input_pos.y = out_nh; + CL_DTYPE4 input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input, 
sampler, input_pos); + int num = i % 4; + switch (num){ + case 0: + square3 += input_data.x * input_data.x; + break; + case 1: + square3 += input_data.y * input_data.y; + break; + case 2: + square3 += input_data.z * input_data.z; + break; + case 3: + square3 += input_data.w * input_data.w; + break; + } + } + int2 out_pos; + out_pos.x = out_c * out_W + out_w; + out_pos.y = out_nh; + CL_DTYPE4 in_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, out_pos); + + float4 out_val; + out_val.x = in_data.x / (pow(k + alpha * (square0), beta)); + if (out_c1 < out_C){ + out_val.y = in_data.y / (pow(k + alpha * (square1), beta)); + } + if (out_c2 < out_C){ + out_val.z = in_data.z / (pow(k + alpha * (square2), beta)); + } + if (out_c3 < out_C){ + out_val.w = in_data.w / (pow(k + alpha * (square3), beta)); + } + CL_DTYPE4 out_data = CONVERT_TYPE_TO(out_val, CL_DTYPE4); + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, out_pos, out_data); +} diff --git a/lite/backends/opencl/cl_kernel/image/nearest_interp_kernel.cl b/lite/backends/opencl/cl_kernel/image/nearest_interp_kernel.cl index b74449d9c8a02551cd74d366849768b4a91a4dce..1df1f0c18b7abb7e715716856dbec7c7d4d5108a 100644 --- a/lite/backends/opencl/cl_kernel/image/nearest_interp_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/nearest_interp_kernel.cl @@ -12,26 +12,37 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#pragma OPENCL EXTENSION cl_khr_fp16 : enable -__kernel void nearest_interp(__read_only image2d_t input, __write_only image2d_t output, - __private const float scale_h, __private const float scale_w, - __private const int in_dims_h, __private const int out_dims_h, - __private const int in_dims_w, __private const int out_dims_w) { - const int c = get_global_id(0); - const int w = get_global_id(1); - const int nh = get_global_id(2); - int2 output_pos; - output_pos.x = c * out_dims_w + w; - output_pos.y = nh; - int out_n = nh / out_dims_h; - int out_h = nh % out_dims_h; - int2 input_pos; - input_pos.x = c * in_dims_w + w / scale_w; - input_pos.y = out_n * in_dims_h + out_h / scale_h; - - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; - half4 input_data = read_imageh(input, sampler, (int2)(input_pos.x, input_pos.y)); - write_imageh(output, (int2)(output_pos.x , output_pos.y), input_data); +#include + + +__kernel void nearest_interp(__read_only image2d_t input, + __write_only image2d_t output, + __private const float scale_h, + __private const float scale_w, + __private const int in_dims_h, + __private const int out_dims_h, + __private const int in_dims_w, + __private const int out_dims_w) { + + const int c = get_global_id(0); + const int w = get_global_id(1); + const int nh = get_global_id(2); + + int2 output_pos; + output_pos.x = c * out_dims_w + w; + output_pos.y = nh; + + int out_n = nh / out_dims_h; + int out_h = nh % out_dims_h; + + int2 input_pos; + input_pos.x = c * in_dims_w + w / scale_w; + input_pos.y = out_n * in_dims_h + out_h / scale_h; + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + CL_DTYPE4 input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(input_pos.x, input_pos.y)); + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(output_pos.x , output_pos.y), input_data); } diff --git a/lite/backends/opencl/cl_kernel/image/pad2d_kernel.cl 
b/lite/backends/opencl/cl_kernel/image/pad2d_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..e65aad3d19bc674aff2f71d2403e611cd247abf1 --- /dev/null +++ b/lite/backends/opencl/cl_kernel/image/pad2d_kernel.cl @@ -0,0 +1,108 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +__kernel void pad2d_constant( + __read_only image2d_t input, __write_only image2d_t output, + const int in_height, const int in_width, + const int out_height, const int out_width, + const int pad_h0, const int pad_h1, + const int pad_w0, const int pad_w1, + const float pad_value) { + + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + const int out_n = out_nh / out_height; + const int out_h = out_nh % out_height; + + int2 output_pos = (int2)(mad24(out_c, out_width, out_w), out_nh); + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + int x = out_w - pad_w0; + int y = out_h - pad_h0; + + if (x < 0 || y < 0 || x >= in_width || y >= in_height) { + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, (CL_DTYPE4)(pad_value)); + } else { + int2 coor = (int2)(out_c * in_width + x, out_n * in_height + y); + CL_DTYPE4 pixel = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coor); + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, pixel); + } +} + +__kernel void pad2d_reflect( + __read_only image2d_t input, 
__write_only image2d_t output, + const int in_height, const int in_width, + const int out_height, const int out_width, + const int pad_h0, const int pad_h1, + const int pad_w0, const int pad_w1, + const float pad_value) { + + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + const int out_n = out_nh / out_height; + const int out_h = out_nh % out_height; + + int2 output_pos = (int2)(mad24(out_c, out_width, out_w), out_nh); + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + int x = out_w - pad_w0; + int y = out_h - pad_h0; + + x = abs(x); + y = abs(y); + x = x < in_width ? x : 2 * in_width - 2 - x; + y = y < in_height ? y : 2 * in_height - 2 - y; + int2 coor = (int2)(out_c * in_width + x, out_n * in_height + y); + CL_DTYPE4 pixel = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coor); + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, pixel); +} + +__kernel void pad2d_edge( + __read_only image2d_t input, __write_only image2d_t output, + const int in_height, const int in_width, + const int out_height, const int out_width, + const int pad_h0, const int pad_h1, + const int pad_w0, const int pad_w1, + const float pad_value) { + + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + const int out_n = out_nh / out_height; + const int out_h = out_nh % out_height; + + int2 output_pos = (int2)(mad24(out_c, out_width, out_w), out_nh); + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + int x = out_w - pad_w0; + int y = out_h - pad_h0; + + x = x > 0 ? x : 0; + x = x < in_width ? x : in_width - 1; + y = y > 0 ? y : 0; + y = y < in_height ? 
y : in_height - 1; + int2 coor = (int2)(out_c * in_width + x, out_n * in_height + y); + CL_DTYPE4 pixel = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coor); + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, pixel); +} diff --git a/lite/backends/opencl/cl_kernel/image/pool_kernel.cl b/lite/backends/opencl/cl_kernel/image/pool_kernel.cl index 775166261d01dc639cd5af8cee49f7e7fb30cb19..f64c2b5e7b21d81a50acd485938ca4f74c3f013b 100644 --- a/lite/backends/opencl/cl_kernel/image/pool_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/pool_kernel.cl @@ -15,17 +15,17 @@ limitations under the License. */ #include __kernel void pool_max(__read_only image2d_t input, - __write_only image2d_t output, - __private const int in_height, - __private const int in_width, - __private const int out_height, - __private const int out_width, - __private const int ksize_h, - __private const int ksize_w, - __private const int stride_h, - __private const int stride_w, - __private const int pad_top, - __private const int pad_left) { + __write_only image2d_t output, + __private const int in_height, + __private const int in_width, + __private const int out_height, + __private const int out_width, + __private const int ksize_h, + __private const int ksize_w, + __private const int stride_h, + __private const int stride_w, + __private const int pad_top, + __private const int pad_left) { const int out_c = get_global_id(0); const int out_w = get_global_id(1); const int out_nh = get_global_id(2); @@ -37,18 +37,19 @@ __kernel void pool_max(__read_only image2d_t input, int start_h = out_h * stride_h - pad_top; int end_h = min(start_h + ksize_h, in_height); - start_h = max(start_h,0); + start_h = max(start_h, 0); int start_w = out_w * stride_w - pad_left; int end_w = min(start_w + ksize_w, in_width); - start_w = max(start_w,0); + start_w = max(start_w, 0); const int pos_in_x = out_c * in_width; const int pos_in_y = out_n * in_height; CL_DTYPE4 max_value = (CL_DTYPE4)(MIN_VALUE); for (int y = start_h; 
y < end_h; ++y) { for (int x = start_w; x < end_w; ++x) { - CL_DTYPE4 tmp = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_x + x, pos_in_y + y)); + CL_DTYPE4 tmp = READ_IMG_TYPE( + CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_x + x, pos_in_y + y)); max_value = max(max_value, tmp); } } @@ -58,17 +59,17 @@ __kernel void pool_max(__read_only image2d_t input, } __kernel void pool_avg(__read_only image2d_t input, - __write_only image2d_t output, - __private const int in_height, - __private const int in_width, - __private const int out_height, - __private const int out_width, - __private const int ksize_h, - __private const int ksize_w, - __private const int stride_h, - __private const int stride_w, - __private const int pad_top, - __private const int pad_left) { + __write_only image2d_t output, + __private const int in_height, + __private const int in_width, + __private const int out_height, + __private const int out_width, + __private const int ksize_h, + __private const int ksize_w, + __private const int stride_h, + __private const int stride_w, + __private const int pad_top, + __private const int pad_left) { const int out_c = get_global_id(0); const int out_w = get_global_id(1); const int out_nh = get_global_id(2); @@ -90,10 +91,121 @@ __kernel void pool_avg(__read_only image2d_t input, for (int y = start_h; y < end_h; ++y) { for (int x = start_w; x < end_w; ++x) { - sum += READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_x + x, pos_in_y + y)); + sum += READ_IMG_TYPE( + CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_x + x, pos_in_y + y)); } } CL_DTYPE4 avg = sum / (ksize_h * ksize_w); const int pos_out_x = mad24(out_c, out_width, out_w); WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(pos_out_x, out_nh), avg); } + +__kernel void pool_avg_global(__read_only image2d_t input, + __write_only image2d_t output, + __private const int in_height, + __private const int in_width, + __private const int out_height, + __private const int out_width, + __private 
const int ksize_h, + __private const int ksize_w, + __private const int stride_h, + __private const int stride_w, + __private const int pad_top, + __private const int pad_left) { + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); // =1 + const int out_nh = get_global_id(2); // = n*1 + + const int out_n = out_nh / out_height; + const int out_h = out_nh % out_height; + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + // do not use dtype4 here + // skip issue for half 2048 + float4 sum = (float4)(0.0f); + + const int pos_in_x = out_c * in_width; + const int pos_in_y = out_n * in_height; + for (int y = 0; y < in_height; ++y) { + for (int x = 0; x < in_width; ++x) { + CL_DTYPE4 tmp = READ_IMG_TYPE( + CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_x + x, pos_in_y + y)); + + sum.x = convert_float(tmp.x) + sum.x; + sum.y = convert_float(tmp.y) + sum.y; + sum.z = convert_float(tmp.z) + sum.z; + sum.w = convert_float(tmp.w) + sum.w; + } + } + const float global_size_div = 1.0f / (in_height * in_width); + CL_DTYPE4 avg; + avg.x = CONVERT_TYPE_TO(sum.x * global_size_div, CL_DTYPE); + avg.y = CONVERT_TYPE_TO(sum.y * global_size_div, CL_DTYPE); + avg.z = CONVERT_TYPE_TO(sum.z * global_size_div, CL_DTYPE); + avg.w = CONVERT_TYPE_TO(sum.w * global_size_div, CL_DTYPE); + +#ifdef DEBUG + if (out_c == 0) { + printf("\033[31msum.x= %f \033 \n ", sum.x); + printf("sum.y=%f \n ", sum.y); + printf("sum.z=%f \n ", sum.z); + printf("sum.w=%f \n ", sum.w); + printf("one4.x=%f \n ", convert_float(one4.x)); + + printf("in_height=%d \n ", in_height); + printf("in_width=%d \n ", in_width); + printf("ksize_h=%d \n ", ksize_h); + printf("ksize_w=%d \n ", ksize_w); + printf("stride_h=%d \n ", stride_h); + printf("stride_w=%d \n ", stride_w); + printf("pad_top=%d \n ", pad_top); + printf("pad_left=%d \n ", pad_left); + printf("out_width=%d \n ", out_width); + printf("out_height=%d \n ", out_height); + printf("i++=%d \n ", i++); + printf("avg.x=%f \n ", convert_float(avg.x)); 
+ printf("avg.y=%f \n ", convert_float(avg.y)); + printf("avg.z=%f \n ", convert_float(avg.z)); + printf("avg.w=%f \n ", convert_float(avg.w)); + } +#endif + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(out_c, out_nh), avg); +} +__kernel void pool_max_global(__read_only image2d_t input, + __write_only image2d_t output, + __private const int in_height, + __private const int in_width, + __private const int out_height, + __private const int out_width, + __private const int ksize_h, + __private const int ksize_w, + __private const int stride_h, + __private const int stride_w, + __private const int pad_top, + __private const int pad_left) { + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); // =1 + const int out_nh = get_global_id(2); // = n*1 + + const int out_n = out_nh / out_height; + const int out_h = out_nh % out_height; + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + CL_DTYPE4 max_value = (CL_DTYPE4)(MIN_VALUE); + const int pos_in_x = out_c * in_width; + const int pos_in_y = out_n * in_height; + for (int y = 0; y < in_height; ++y) { + for (int x = 0; x < in_width; ++x) { + max_value = max(max_value, + READ_IMG_TYPE(CL_DTYPE_CHAR, + input, + sampler, + (int2)(pos_in_x + x, pos_in_y + y))); + } + } + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(out_c, out_nh), max_value); +} \ No newline at end of file diff --git a/lite/backends/opencl/cl_kernel/image/relu6_kernel.cl b/lite/backends/opencl/cl_kernel/image/relu6_kernel.cl deleted file mode 100644 index 7750bd98a29151ba2428bdafd462420393fe7433..0000000000000000000000000000000000000000 --- a/lite/backends/opencl/cl_kernel/image/relu6_kernel.cl +++ /dev/null @@ -1,32 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include - -__kernel void relu6(__read_only image2d_t input, - __write_only image2d_t output, - __private const float threshold){ - - const int x = get_global_id(0); - const int y = get_global_id(1); - - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; - - CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); - in = max((CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), in); - in = min((CL_DTYPE4)(threshold, threshold, threshold, threshold), in); - WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in); -} diff --git a/lite/backends/opencl/cl_kernel/image/scale_kernel.cl b/lite/backends/opencl/cl_kernel/image/scale_kernel.cl index 739ff1338582b65d87dbd9c92f1ea86e0c49f0ff..dfc25063cc2e36d768f1bc4d7ff992c87fe17592 100644 --- a/lite/backends/opencl/cl_kernel/image/scale_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/scale_kernel.cl @@ -27,6 +27,6 @@ __kernel void scale(__read_only image2d_t input, CLK_FILTER_NEAREST; CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); - in = convert_float(scale) * in + convert_float(bias); + in = CONVERT_TYPE_TO(scale, CL_DTYPE) * in + CONVERT_TYPE_TO(bias, CL_DTYPE); WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in); } diff --git a/lite/backends/opencl/cl_kernel/image/sigmoid_kernel.cl b/lite/backends/opencl/cl_kernel/image/sigmoid_kernel.cl deleted file mode 100644 index d2cb8fa36e21167979172fba634a7862c932b74c..0000000000000000000000000000000000000000 --- a/lite/backends/opencl/cl_kernel/image/sigmoid_kernel.cl +++ /dev/null @@ -1,30 +0,0 @@ 
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include - -__kernel void sigmoid(__read_only image2d_t input, - __write_only image2d_t output) { - - const int x = get_global_id(0); // image_width - const int y = get_global_id(1); // image_height - - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; - - CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); - CL_DTYPE4 out = 1 / (1 + exp(-in)); - WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out); -} diff --git a/lite/backends/opencl/cl_kernel/image/slice_kernel.cl b/lite/backends/opencl/cl_kernel/image/slice_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..1ef74bb14213beaa0e83e28b99b592ac1dcc667d --- /dev/null +++ b/lite/backends/opencl/cl_kernel/image/slice_kernel.cl @@ -0,0 +1,78 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +__kernel void slice(__read_only image2d_t input, __write_only image2d_t output, + __private const int start, __private const int end, + __private const int dims_w){ + + const int c = get_global_id(0); + const int w = get_global_id(1); + const int nh = get_global_id(2); + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + + int2 output_pos; + output_pos.x = c * dims_w + w; + output_pos.y = nh; + + int2 input_pos; + half4 input_data; + half4 output_data; + + if (start % 4 == 0) { + input_pos.x = (4 * c + start) / 4 * dims_w + w; + input_pos.y = nh; + input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler,input_pos); + output_data = input_data; + } else if (start % 4 == 1) { + input_pos.x = (4 * c + start) / 4 * dims_w + w; + input_pos.y = nh; + input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler,input_pos); + output_data.x = input_data.y; + output_data.y = input_data.z; + output_data.z = input_data.w; + input_pos.x = input_pos.x + dims_w; + input_pos.y = nh; + input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler,input_pos); + output_data.w = input_data.x; + } else if (start % 4 == 2) { + input_pos.x = (4 * c + start) / 4 * dims_w + w; + input_pos.y = nh; + input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler,input_pos); + output_data.x = input_data.z; + output_data.y = input_data.w; + input_pos.x = input_pos.x + dims_w; + input_pos.y = nh; + input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler,input_pos); + output_data.z = input_data.x; + output_data.w = input_data.y; + } else if (start % 4 == 3) { + input_pos.x = (4 * c + start) / 4 * dims_w + w; + input_pos.y = nh; + input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler,input_pos); + output_data.x = input_data.w; + input_pos.x = input_pos.x + dims_w; + input_pos.y = nh; + input_data = READ_IMG_TYPE(CL_DTYPE_CHAR, input, 
sampler,input_pos); + output_data.y = input_data.x; + output_data.z = input_data.y; + output_data.w = input_data.z; + } + write_imageh(output, output_pos, output_data); + +} + diff --git a/lite/backends/opencl/cl_runtime.cc b/lite/backends/opencl/cl_runtime.cc index 0c7b2f8575a88082f6d79a5392c4468715a701b9..63c9954f9181e9252c4d14f57b6ed29107965fe3 100644 --- a/lite/backends/opencl/cl_runtime.cc +++ b/lite/backends/opencl/cl_runtime.cc @@ -75,13 +75,8 @@ cl::CommandQueue& CLRuntime::command_queue() { std::unique_ptr CLRuntime::CreateProgram( const cl::Context& context, std::string file_name) { - std::ifstream file{file_name, std::ios::binary | std::ios::ate}; - CHECK(file.is_open()) << "Can't open file from " << file_name; - auto size = file.tellg(); - CHECK(size > 0) << "size is too small."; - std::string content(size, '\0'); - file.seekg(0); - file.read(&content[0], size); + auto cl_file = opencl_kernels_files.find(file_name); + std::string content(cl_file->second.begin(), cl_file->second.end()); cl::Program::Sources sources; sources.push_back(content); auto prog = @@ -101,8 +96,8 @@ std::unique_ptr CLRuntime::CreateEvent( } bool CLRuntime::BuildProgram(cl::Program* program, const std::string& options) { - std::string build_option = options + " -cl-fast-relaxed-math -I " + - CLRuntime::Global()->cl_path() + "/cl_kernel"; + /* -I +CLRuntime::Global()->cl_path() + "/cl_kernel"*/ + std::string build_option = options + " -cl-fast-relaxed-math "; VLOG(4) << "OpenCL build_option: " << build_option; status_ = program->build({*device_}, build_option.c_str()); CL_CHECK_ERROR(status_); @@ -133,6 +128,12 @@ bool CLRuntime::InitializePlatform() { } bool CLRuntime::InitializeDevice() { + // ===================== BASIC ===================== + // CL_DEVICE_TYPE_GPU + // CL_DEVICE_NAME + // CL_DEVICE_SUPPORT + // CL_DEVICE_MAX_COMPUTE_UNITS + // CL_DEVICE_MAX_CLOCK_FREQUENCY std::vector all_devices; status_ = platform_->getDevices(CL_DEVICE_TYPE_GPU, &all_devices); 
CL_CHECK_ERROR(status_); @@ -145,27 +146,153 @@ bool CLRuntime::InitializeDevice() { auto device_name = device_->getInfo(); LOG(INFO) << "Using device: " << device_name; + + cl_device_type device_type = device_->getInfo(); + auto device_type_to_str = [](cl_device_type t) -> std::string { + std::string t_str{""}; + switch (t) { + case CL_DEVICE_TYPE_CPU: + t_str = "CPU"; + break; + case CL_DEVICE_TYPE_GPU: + t_str = "GPU"; + break; + case CL_DEVICE_TYPE_ACCELERATOR: + t_str = "Accelerator"; + break; + case CL_DEVICE_TYPE_DEFAULT: + t_str = "Default"; + break; + default: + t_str = "Unknown"; + } + return t_str; + }; + LOG(INFO) << "device_type:" << device_type_to_str(device_type); + device_info_["CL_DEVICE_TYPE"] = device_type; + + auto max_units = device_->getInfo(); + LOG(INFO) << "The chosen device has " << max_units << " compute units."; + device_info_["CL_DEVICE_MAX_COMPUTE_UNITS"] = max_units; + + auto max_clock_freq = device_->getInfo(); + LOG(INFO) << "CL_DEVICE_MAX_CLOCK_FREQUENCY:" << max_clock_freq; + device_info_["CL_DEVICE_MAX_CLOCK_FREQUENCY"] = max_clock_freq; + + // ===================== MEMORY ===================== + // CL_DEVICE_LOCAL_MEM_SIZE + // CL_DEVICE_GLOBAL_MEM_CACHE_SIZE + // CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE + // CL_DEVICE_GLOBAL_MEM_SIZE + auto local_mem_kb = + static_cast(device_->getInfo()) / 1024; + LOG(INFO) << "The local memory size of the chosen device is " << local_mem_kb + << " KB."; + device_info_["CL_DEVICE_LOCAL_MEM_SIZE_KB"] = local_mem_kb; + + auto global_mem_cache_size_kb = + static_cast(device_->getInfo()) / + 1024; + LOG(INFO) << "CL_DEVICE_GLOBAL_MEM_CACHE_SIZE(KB):" + << global_mem_cache_size_kb << " KB."; + device_info_["CL_DEVICE_GLOBAL_MEM_CACHE_SIZE_KB"] = global_mem_cache_size_kb; + + auto global_mem_cacheline_size_kb = + static_cast( + device_->getInfo()) / + 1024; + LOG(INFO) << "CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE(KB):" + << global_mem_cacheline_size_kb << " KB."; + 
device_info_["CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE_KB"] = + global_mem_cacheline_size_kb; + + auto global_mem_size_kb = + static_cast(device_->getInfo()) / 1024; + LOG(INFO) << "CL_DEVICE_GLOBAL_MEM_SIZE(KB):" << global_mem_size_kb << " KB."; + device_info_["CL_DEVICE_GLOBAL_MEM_SIZE_KB"] = global_mem_size_kb; + + // ===================== WORK_GROUP ===================== + // CL_DEVICE_MAX_WORK_GROUP_SIZE + // CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS + // CL_DEVICE_MAX_WORK_ITEM_SIZES + auto max_work_group_size = device_->getInfo(); + LOG(INFO) << "CL_DEVICE_MAX_WORK_GROUP_SIZE:" << max_work_group_size; + device_info_["CL_DEVICE_MAX_WORK_GROUP_SIZE"] = max_work_group_size; + + auto max_dims_num = device_->getInfo(); + LOG(INFO) << "CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS:" << max_dims_num; + device_info_["CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS"] = max_dims_num; + + auto max_work_item_sizes = device_->getInfo(); + for (size_t i = 0; i < max_work_item_sizes.size(); ++i) { + LOG(INFO) << "max_work_item_sizes[" << i << "]:" << max_work_item_sizes[i]; + std::string dim_key = "CL_DEVICE_MAX_WORK_ITEM_SIZES_" + std::to_string(i); + device_info_[dim_key] = max_work_item_sizes[i]; + } + + // ===================== BUFFER ===================== + // CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE + auto max_constant_buffer_size_kb = + static_cast( + device_->getInfo()) / + 1024; + LOG(INFO) << "CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE:" + << max_constant_buffer_size_kb; + device_info_["CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE"] = + max_constant_buffer_size_kb; + + // ===================== IMAGE ===================== + // CL_DEVICE_IMAGE_SUPPORT + // CL_DEVICE_IMAGE2D_MAX_HEIGHT + // CL_DEVICE_IMAGE2D_MAX_WIDTH auto image_support = device_->getInfo(); if (image_support) { LOG(INFO) << "The chosen device supports image processing."; + device_info_["CL_DEVICE_IMAGE_SUPPORT"] = 1; } else { LOG(INFO) << "The chosen device doesn't support image processing!"; + device_info_["CL_DEVICE_IMAGE_SUPPORT"] = 0; return false; } + 
+ auto image2d_max_height = device_->getInfo(); + LOG(INFO) << "CL_DEVICE_IMAGE2D_MAX_HEIGHT:" << image2d_max_height; + device_info_["CL_DEVICE_IMAGE2D_MAX_HEIGHT"] = image2d_max_height; + + auto image2d_max_width = device_->getInfo(); + LOG(INFO) << "CL_DEVICE_IMAGE2D_MAX_WIDTH:" << image2d_max_width; + device_info_["CL_DEVICE_IMAGE2D_MAX_WIDTH"] = image2d_max_width; + + // ===================== OTHERS / EXTENSION / VERSION ===================== + // CL_DEVICE_EXTENSIONS + // CL_DEVICE_ADDRESS_BITS auto ext_data = device_->getInfo(); VLOG(4) << "The extensions supported by this device: " << ext_data; if (ext_data.find("cl_khr_fp16") != std::string::npos) { LOG(INFO) << "The chosen device supports the half data type."; + device_info_["CL_DEVICE_EXTENSIONS_FP16"] = 1; } else { LOG(INFO) << "The chosen device doesn't support the half data type!"; + device_info_["CL_DEVICE_EXTENSIONS_FP16"] = 0; } - auto max_units = device_->getInfo(); - LOG(INFO) << "The chosen device has " << max_units << " compute units."; - auto local_mem = device_->getInfo(); - LOG(INFO) << "The local memory size of the chosen device is " - << static_cast(local_mem) / 1024 << " KB."; + + auto address_bits = device_->getInfo(); + LOG(INFO) << "CL_DEVICE_ADDRESS_BITS:" << address_bits; + device_info_["CL_DEVICE_ADDRESS_BITS"] = address_bits; + + auto driver_version = device_->getInfo(); + LOG(INFO) << "CL_DRIVER_VERSION:" << driver_version; + return true; } +std::map& CLRuntime::GetDeviceInfo() { + if (0 != device_info_.size()) { + return device_info_; + } + InitializeDevice(); + return device_info_; +} + } // namespace lite } // namespace paddle diff --git a/lite/backends/opencl/cl_runtime.h b/lite/backends/opencl/cl_runtime.h index 0859780c69cc8647c1fd54bf1ab12be29217c9e1..1a5ededeff37d9f6820af6a49dc22c669620734b 100644 --- a/lite/backends/opencl/cl_runtime.h +++ b/lite/backends/opencl/cl_runtime.h @@ -15,6 +15,7 @@ limitations under the License. 
*/ #pragma once #include +#include #include #include #include @@ -24,6 +25,9 @@ limitations under the License. */ namespace paddle { namespace lite { +extern const std::map> + opencl_kernels_files; + class CLRuntime { public: static CLRuntime* Global(); @@ -51,6 +55,8 @@ class CLRuntime { void set_cl_path(std::string cl_path) { cl_path_ = cl_path; } + std::map& GetDeviceInfo(); + private: CLRuntime() = default; @@ -80,6 +86,8 @@ class CLRuntime { return queue; } + std::map device_info_; + std::string cl_path_; std::shared_ptr platform_{nullptr}; diff --git a/lite/backends/opencl/target_wrapper.cc b/lite/backends/opencl/target_wrapper.cc index 310567baa539697f6a67b59f6c0e5f29ce46a80e..9cf07dfc0c474b0b5c57b8355c099eba15610a91 100644 --- a/lite/backends/opencl/target_wrapper.cc +++ b/lite/backends/opencl/target_wrapper.cc @@ -81,10 +81,10 @@ void *TargetWrapperCL::MallocImage(const size_t cl_image2d_width, return cl_image; } -template <> // use int16_t represents half float -void *TargetWrapperCL::MallocImage(const size_t cl_image2d_width, - const size_t cl_image2d_height, - void *host_ptr) { +template <> // use uint16_t represents half float +void *TargetWrapperCL::MallocImage(const size_t cl_image2d_width, + const size_t cl_image2d_height, + void *host_ptr) { cl::ImageFormat img_format(CL_RGBA, GetCLChannelType(PRECISION(kFP16))); cl_int status; cl::Image2D *cl_image = diff --git a/lite/backends/x86/jit/gen/blas.h b/lite/backends/x86/jit/gen/blas.h index 39920195b245e1c44ff68ab91af94d25c949bd02..4317d558c6252e9163bc545cba4859fbcb89f804 100644 --- a/lite/backends/x86/jit/gen/blas.h +++ b/lite/backends/x86/jit/gen/blas.h @@ -17,6 +17,7 @@ #include #include "glog/logging.h" #include "lite/backends/x86/jit/gen/jitcode.h" +#include "lite/utils/string.h" namespace paddle { namespace lite { @@ -64,7 +65,7 @@ class VXXJitCode : public JitCode { base += "_Vec"; } base += (with_relu_ ? 
"_Relu" : ""); - base += "_D" + std::to_string(num_); + base += "_D" + paddle::lite::to_string(num_); return base; } void genCode() override; diff --git a/lite/backends/x86/jit/gen/embseqpool.h b/lite/backends/x86/jit/gen/embseqpool.h index 7cae76f9dd99cf904e831b196bd493623ff7eb1d..999960ece4170d561419ad24bd94c512ce167eb0 100644 --- a/lite/backends/x86/jit/gen/embseqpool.h +++ b/lite/backends/x86/jit/gen/embseqpool.h @@ -47,7 +47,7 @@ class EmbSeqPoolJitCode : public JitCode { } else if (type_ == SeqPoolType::kSqrt) { base += "_Sqrt"; } - base += ("_W" + std::to_string(tbl_w_)); + base += ("_W" + paddle::lite::to_string(tbl_w_)); return base; } void genCode() override; diff --git a/lite/backends/x86/jit/gen/matmul.h b/lite/backends/x86/jit/gen/matmul.h index b1b302b7904a5d92952f4385c483eccdc5df3592..e7be6750cf0d232b41d3be61001eb0af4c52a129 100644 --- a/lite/backends/x86/jit/gen/matmul.h +++ b/lite/backends/x86/jit/gen/matmul.h @@ -38,8 +38,8 @@ class MatMulJitCode : public JitCode { std::string name() const override { std::string base = "MatMulJitCode"; - base = base + "_M" + std::to_string(m_) + "_N" + std::to_string(n_) + "_K" + - std::to_string(k_); + base = base + "_M" + paddle::lite::to_string(m_) + "_N" + + paddle::lite::to_string(n_) + "_K" + paddle::lite::to_string(k_); return base; } void genCode() override; diff --git a/lite/backends/x86/jit/gen/seqpool.h b/lite/backends/x86/jit/gen/seqpool.h index 346179cfbbd0e8291dc17b266366c5df07114b7f..60e27993057b58eb8a4a07fcd0a368fc0a9441fc 100644 --- a/lite/backends/x86/jit/gen/seqpool.h +++ b/lite/backends/x86/jit/gen/seqpool.h @@ -47,7 +47,7 @@ class SeqPoolJitCode : public JitCode { } else if (type_ == SeqPoolType::kSqrt) { base += "_Sqrt"; } - base += ("_W" + std::to_string(w_)); + base += ("_W" + paddle::lite::to_string(w_)); return base; } void genCode() override; diff --git a/lite/core/CMakeLists.txt b/lite/core/CMakeLists.txt index 
1d0558451fce67433d966d1f4bff82af26459e33..db8bc29d70d4764f14f24915fcbc254ba2af91df 100644 --- a/lite/core/CMakeLists.txt +++ b/lite/core/CMakeLists.txt @@ -34,9 +34,9 @@ lite_cc_library(scope SRCS scope.cc DEPS tensor) lite_cc_library(device_info SRCS device_info.cc DEPS tensor) if (LITE_WITH_ARM) -lite_cc_library(context SRCS context.cc DEPS tensor any device_info CL_DEPS cl_context gflags) +lite_cc_library(context SRCS context.cc DEPS tensor any device_info CL_DEPS cl_context) else() -lite_cc_library(context SRCS context.cc DEPS tensor any device_info eigen3 CL_DEPS cl_context gflags) +lite_cc_library(context SRCS context.cc DEPS tensor any device_info eigen3 CL_DEPS cl_context) endif() #-------------------------------------------- GET CODE META INFO ------------------------------------------ @@ -67,6 +67,13 @@ message(STATUS "commit: ${PADDLE_LITE_COMMIT}") configure_file(version.h.in version.h) #----------------------------------------------- NOT CHANGE ----------------------------------------------- +# A trick to generate the opencl_kernels_source.cc +#add_custom_command( +# COMMAND python ${CMAKE_SOURCE_DIR}/lite/tools/cmake_tools/gen_opencl_code.py +# ${CMAKE_SOURCE_DIR}/lite/backends/opencl/cl_kernel +# ${CMAKE_BINARY_DIR}/lite/backends/opencl/opencl_kernels_source.cc +# OUTPUT opencl_kernels_source.cc # not a real path to the output to force it execute every time. +# ) # A trick to generate the paddle_use_kernels.h add_custom_command( COMMAND python ${CMAKE_SOURCE_DIR}/lite/tools/cmake_tools/parse_kernel_registry.py @@ -86,9 +93,13 @@ add_custom_command( OUTPUT ops.h # not a real path to the output to force it execute every time. 
) # generate fake kernels for memory_optimize_tool + +#-------------------------------opt---------------------------------------------------------------- +# tricks to create headfiles for opt add_custom_command( COMMAND python ${CMAKE_SOURCE_DIR}/lite/tools/cmake_tools/create_fake_kernel_registry.py ${kernels_src_list} + ${fake_kernels_src_list} ${CMAKE_BINARY_DIR}/all_kernel_faked.cc ${CMAKE_BINARY_DIR}/kernel_src_map.h OUTPUT all_kernel_faked.cc # not a real path to the output to force it execute every time. @@ -96,10 +107,12 @@ add_custom_command( add_custom_target(op_list_h DEPENDS ops.h) add_custom_target(kernel_list_h DEPENDS kernels.h) add_custom_target(all_kernel_faked_cc DEPENDS all_kernel_faked.cc) + # create headfile to restore ops info sorted by suppported platforms add_custom_command( COMMAND python ${CMAKE_SOURCE_DIR}/lite/tools/cmake_tools/record_supported_kernel_op.py ${kernels_src_list} + ${fake_kernels_src_list} ${ops_src_list} ${CMAKE_BINARY_DIR}/supported_kernel_op_info.h OUTPUT supported_kernel_op_info.h # not a real path to the output to force it execute every time. 
diff --git a/lite/core/arena/framework.cc b/lite/core/arena/framework.cc index fe36f1e1ba16ad85c44136b09a0d2e5d3fadf688..614ee990a9811ab74ceedb4fa000fa385698d679 100644 --- a/lite/core/arena/framework.cc +++ b/lite/core/arena/framework.cc @@ -59,6 +59,8 @@ void TestCase::CreateInstruction() { CHECK(it != kernels.end()) << "failed to create the kernel in " << place_.DebugString() << " with alias: " << alias_; + // reset final place + place_ = (*it)->place(); // prepare context (*it)->SetContext(std::move(ctx_)); instruction_.reset(new Instruction(op, std::move(*it))); @@ -74,25 +76,164 @@ void TestCase::PrepareInputsForInstruction() { const auto* param_type = ParamTypeRegistry::Global().RetrieveInArgument( place_, kernel_key, arg); - const auto* inst_type = Type::GetTensorTy(TARGET(kHost)); + const Type* inst_type = nullptr; + if (param_type->type->IsTensor()) { + inst_type = Type::GetTensorTy(TARGET(kHost)); + } else if (param_type->type->IsTensorList()) { + inst_type = Type::GetTensorListTy(TARGET(kHost)); + } else { + LOG(FATAL) << "unsupported param_type"; + } + CHECK(scope_->FindVar(var)); - const auto* shared_tensor = scope_->FindTensor((var)); if (!TargetCompatibleTo(*inst_type, *param_type->type)) { - /// Create a tensor in the instruction's scope, alloc memory and then - /// copy data there. - auto* target_tensor = inst_scope_->NewTensor(var); - CHECK(!shared_tensor->dims().empty()) << "shared_tensor is empty yet"; - target_tensor->Resize(shared_tensor->dims()); - TargetCopy(param_type->type->target(), - target_tensor->mutable_data(param_type->type->target(), - shared_tensor->memory_size()), - shared_tensor->raw_data(), - shared_tensor->memory_size()); + /// Create a tensor or tensor_array in the instruction's scope, + /// alloc memory and then copy data there. 
+ if (param_type->type->IsTensor()) { + const auto* shared_tensor = scope_->FindTensor(var); + auto* target_tensor = inst_scope_->NewTensor(var); + CHECK(!shared_tensor->dims().empty()) << "shared_tensor is empty yet"; + target_tensor->Resize(shared_tensor->dims()); + TargetCopy(param_type->type->target(), + target_tensor->mutable_data(param_type->type->target(), + shared_tensor->memory_size()), + shared_tensor->raw_data(), + shared_tensor->memory_size()); + } else if (param_type->type->IsTensorList()) { + const auto* shared_tensor_array = + scope_->FindVar(var)->GetMutable>(); + auto* target_tensor_array = + inst_scope_->Var(var)->GetMutable>(); + CHECK(!shared_tensor_array->empty()) + << "shared_tensor_array is empty yet"; + target_tensor_array->resize(shared_tensor_array->size()); + for (int i = 0; i < shared_tensor_array->size(); i++) { + target_tensor_array->at(i).Resize( + shared_tensor_array->at(i).dims()); + TargetCopy(param_type->type->target(), + target_tensor_array->at(i).mutable_data( + param_type->type->target(), + shared_tensor_array->at(i).memory_size()), + shared_tensor_array->at(i).raw_data(), + shared_tensor_array->at(i).memory_size()); + } + } else { + LOG(FATAL) << "not support"; + } } } } } +template +bool TestCase::CheckTensorPrecision(const Tensor* a_tensor, + const Tensor* b_tensor, + float abs_error) { + CHECK(a_tensor); + CHECK(b_tensor); + + CHECK(ShapeEquals(a_tensor->dims(), b_tensor->dims())); + + CHECK(a_tensor->lod() == b_tensor->lod()) << "lod not match"; + + // The baseline should output in host devices. + CHECK(b_tensor->target() == TARGET(kHost) || + b_tensor->target() == TARGET(kX86) || + b_tensor->target() == TARGET(kARM)); + + const T* a_data{}; + switch (a_tensor->target()) { + case TARGET(kX86): + case TARGET(kHost): + case TARGET(kARM): + a_data = static_cast(a_tensor->raw_data()); + break; + + default: + // Before compare, need to copy data from `target` device to host. 
+ LOG(FATAL) << "Not supported"; + } + + CHECK(a_data); + + const T* b_data = static_cast(b_tensor->raw_data()); + + bool success = true; + for (int i = 0; i < a_tensor->dims().production(); i++) { + EXPECT_NEAR(a_data[i], b_data[i], abs_error); + if (fabsf(a_data[i] - b_data[i]) > abs_error) { + success = false; + } + } + return success; +} + +bool TestCase::CheckPrecision(const Tensor* a_tensor, + const Tensor* b_tensor, + float abs_error, + PrecisionType precision_type) { + PrecisionType precision_type_t = precision_type; + if (precision_type == PRECISION(kAny)) { + precision_type_t = b_tensor->precision(); + } + CHECK(precision_type_t == b_tensor->precision()) + << "arg precision type and base tensor precision type are not matched! " + "arg precision type is: " + << PrecisionToStr(precision_type) << ", base tensor precision type is: " + << PrecisionToStr(b_tensor->precision()); + CHECK(a_tensor->precision() == b_tensor->precision()) + << "real tensor precision type and base tensor precision type are not " + "matched! 
real tensor precision type is: " + << PrecisionToStr(a_tensor->precision()) + << ", base tensor precision type is: " + << PrecisionToStr(b_tensor->precision()); + switch (precision_type_t) { + case PRECISION(kFloat): + return CheckTensorPrecision(a_tensor, b_tensor, abs_error); + case PRECISION(kInt8): + return CheckTensorPrecision(a_tensor, b_tensor, abs_error); + case PRECISION(kInt32): + return CheckTensorPrecision(a_tensor, b_tensor, abs_error); + case PRECISION(kInt64): + return CheckTensorPrecision(a_tensor, b_tensor, abs_error); + case PRECISION(kBool): + return CheckTensorPrecision(a_tensor, b_tensor, abs_error); + default: + LOG(FATAL) << "not support type: " << PrecisionToStr(precision_type); + return false; + } +} + +bool TestCase::CheckPrecision(const std::string& var_name, + float abs_error, + PrecisionType precision_type) { + bool success = true; + if (inst_scope_->FindVar(var_name)->IsType()) { + auto a_tensor = inst_scope_->FindTensor(var_name); + auto b_tensor = base_scope_->FindTensor(var_name); + success = success && + CheckPrecision(a_tensor, b_tensor, abs_error, precision_type); + } else if (inst_scope_->FindVar(var_name)->IsType>()) { + auto a_tensor_array = + inst_scope_->FindVar(var_name)->GetMutable>(); + auto b_tensor_array = + base_scope_->FindVar(var_name)->GetMutable>(); + CHECK_EQ(a_tensor_array->size(), b_tensor_array->size()); + for (int i = 0; i < a_tensor_array->size(); i++) { + Tensor* a_tensor = &(a_tensor_array->at(i)); + Tensor* b_tensor = &(b_tensor_array->at(i)); + if (a_tensor->dims().size() == 0 && b_tensor->dims().size() == 0) { + continue; + } + success = success && + CheckPrecision(a_tensor, b_tensor, abs_error, precision_type); + } + } else { + LOG(FATAL) << "unsupported var type"; + } + return success; +} + TestCase::~TestCase() { if (op_desc_->Type() == "subgraph") { // Release the subblock desc of Subgraph op diff --git a/lite/core/arena/framework.h b/lite/core/arena/framework.h index 
85edda26e6591bada967165317de00b169a2d0cd..7050355fbfae55b9ba626119cd95f8e952c27430 100644 --- a/lite/core/arena/framework.h +++ b/lite/core/arena/framework.h @@ -66,11 +66,24 @@ class TestCase { /// output. virtual void RunBaseline(Scope* scope) = 0; - /// Check the precision of the output tensors. It will compare the same tensor - /// in two scopes, one of the instruction execution, and the other for the - /// baseline. + // checkout the precision of the two tensors with type T. b_tensor is baseline template - bool CheckPrecision(const std::string& var_name, float abs_error); + bool CheckTensorPrecision(const Tensor* a_tensor, + const Tensor* b_tensor, + float abs_error); + + // checkout the precision of the two tensors. b_tensor is baseline + bool CheckPrecision(const Tensor* a_tensor, + const Tensor* b_tensor, + float abs_error, + PrecisionType precision_type); + + /// Check the precision of the output variables. It will compare the same + /// tensor (or all tensors of the tensor_array) in two scopes, one of the + /// instruction execution, and the other for the baseline. + bool CheckPrecision(const std::string& var_name, + float abs_error, + PrecisionType precision_type); const cpp::OpDesc& op_desc() { return *op_desc_; } @@ -78,20 +91,6 @@ class TestCase { // kernel registry. void CheckKernelConsistWithDefinition() {} - // Get the real precision of the output for check precision. When the declare - // precision obtained from the kernel is any, we should set the precision of - // the output in test case. 
- bool GetPrecisonType(const std::string& var_name, - PrecisionType* precision_type) { - auto res = precision_type_map_.find(var_name); - if (res == precision_type_map_.end()) { - return false; - } else { - *precision_type = precision_type_map_.at(var_name); - return true; - } - } - Scope& scope() { return *scope_; } Scope* baseline_scope() { return base_scope_; } @@ -120,22 +119,37 @@ class TestCase { tensor->set_persistable(is_persistable); } - // Prepare for the operator. - virtual void PrepareOpDesc(cpp::OpDesc* op_desc) = 0; + /// Prepare a tensor_array in host. The tensors will be created in scope_. + /// Need to specify the targets other than X86 or ARM. + template + void SetCommonTensorList(const std::string& var_name, + const std::vector& array_tensor_dims, + const std::vector>& datas, + const std::vector& lods = {}) { + CHECK_EQ(array_tensor_dims.size(), datas.size()); + if (!lods.empty()) { + CHECK_EQ(array_tensor_dims.size(), lods.size()); + } - // Set the real precision of the output for check precision. When the declare - // precision obtained from the kernel is any, we should set the precision of - // the output in test case. - void SetPrecisionType(const std::string& var_name, - const PrecisionType& precision_type) { - auto res = precision_type_map_.find(var_name); - if (res == precision_type_map_.end()) { - precision_type_map_.insert({var_name, precision_type}); - } else { - precision_type_map_.at(var_name) = precision_type; + auto* tensor_array = + scope_->Var(var_name)->GetMutable>(); + for (int i = 0; i < array_tensor_dims.size(); i++) { + Tensor tmp; + tmp.Resize(array_tensor_dims[i]); + auto* tmp_data = tmp.mutable_data(); + memcpy(tmp_data, + datas[i].data(), + array_tensor_dims[i].production() * sizeof(T)); + if (!lods.empty()) { + tmp.set_lod(lods[i]); + } + tensor_array->push_back(tmp); } } + // Prepare for the operator. 
+ virtual void PrepareOpDesc(cpp::OpDesc* op_desc) = 0; + public: const Instruction& instruction() { return *instruction_; } @@ -179,7 +193,6 @@ class TestCase { Scope* base_scope_{}; std::unique_ptr op_desc_; std::unique_ptr instruction_; - std::unordered_map precision_type_map_; }; class Arena { @@ -236,22 +249,7 @@ class Arena { const Type* type = tester_->instruction().kernel()->GetOutputDeclType(arg_name); auto precision_type = type->precision(); - if (precision_type == PRECISION(kAny)) { - CHECK(tester_->GetPrecisonType(var_name, &precision_type)); - } - switch (precision_type) { - case PRECISION(kFloat): - return tester_->CheckPrecision(var_name, abs_error_); - case PRECISION(kInt8): - return tester_->CheckPrecision(var_name, abs_error_); - case PRECISION(kInt32): - return tester_->CheckPrecision(var_name, abs_error_); - case PRECISION(kBool): - return tester_->CheckPrecision(var_name, abs_error_); - default: - LOG(FATAL) << "not support type " << PrecisionToStr(type->precision()); - return false; - } + return tester_->CheckPrecision(var_name, abs_error_, precision_type); } private: @@ -260,49 +258,6 @@ class Arena { float abs_error_; }; -template -bool TestCase::CheckPrecision(const std::string& var_name, float abs_error) { - auto a_tensor = inst_scope_->FindTensor(var_name); - auto b_tensor = base_scope_->FindTensor(var_name); - CHECK(a_tensor); - CHECK(b_tensor); - - CHECK(ShapeEquals(a_tensor->dims(), b_tensor->dims())); - - CHECK(a_tensor->lod() == b_tensor->lod()) << "lod not match"; - - // The baseline should output in host devices. - CHECK(b_tensor->target() == TARGET(kHost) || - b_tensor->target() == TARGET(kX86) || - b_tensor->target() == TARGET(kARM)); - - const T* a_data{}; - switch (a_tensor->target()) { - case TARGET(kX86): - case TARGET(kHost): - case TARGET(kARM): - a_data = static_cast(a_tensor->raw_data()); - break; - - default: - // Before compare, need to copy data from `target` device to host. 
- LOG(FATAL) << "Not supported"; - } - - CHECK(a_data); - - const T* b_data = static_cast(b_tensor->raw_data()); - - bool success = true; - for (int i = 0; i < a_tensor->dims().production(); i++) { - EXPECT_NEAR(a_data[i], b_data[i], abs_error); - if (fabsf(a_data[i] - b_data[i]) > abs_error) { - success = false; - } - } - return success; -} - } // namespace arena } // namespace lite } // namespace paddle diff --git a/lite/core/context.cc b/lite/core/context.cc index 948aac0c794969304b585520bfb7229410555578..be886168e02e21d192305d701110ce5075ffba63 100644 --- a/lite/core/context.cc +++ b/lite/core/context.cc @@ -14,10 +14,6 @@ #include "lite/core/context.h" -#ifdef LITE_WITH_OPENCL -DEFINE_string(cl_path, "/data/local/tmp/opencl", "The OpenCL kernels path."); -#endif - namespace paddle { namespace lite {} // namespace lite } // namespace paddle diff --git a/lite/core/context.h b/lite/core/context.h index 653329e4f24b1f391ea41ed39819b60c8a598a3b..88fe00d0f2aab41cfd3e5562d29f0a8a82598428 100644 --- a/lite/core/context.h +++ b/lite/core/context.h @@ -20,7 +20,6 @@ #include "lite/backends/cuda/cuda_utils.h" #endif #ifdef LITE_WITH_OPENCL -#include #include #include "lite/backends/opencl/cl_context.h" #include "lite/backends/opencl/cl_runtime.h" @@ -36,10 +35,7 @@ #include "lite/core/target_wrapper.h" #include "lite/core/tensor.h" #include "lite/utils/all.h" - -#ifdef LITE_WITH_OPENCL -DECLARE_string(cl_path); -#endif +#include "lite/utils/env.h" namespace paddle { namespace lite { @@ -56,6 +52,7 @@ using XPUContext = Context; using OpenCLContext = Context; using FPGAContext = Context; using BMContext = Context; +using MLUContext = Context; template <> class Context { @@ -304,7 +301,6 @@ class Context { void InitOnce() { // Init cl runtime. 
CHECK(CLRuntime::Global()->IsInitSuccess()) << "OpenCL runtime init failed"; - CLRuntime::Global()->set_cl_path(FLAGS_cl_path); cl_context_ = std::make_shared(); cl_wait_list_ = std::make_shared(); @@ -400,7 +396,7 @@ class ContextScheduler { break; #endif default: -#ifndef LITE_ON_MODEL_OPTIMIZE_TOOL +#if (!defined LITE_ON_MODEL_OPTIMIZE_TOOL) && (!defined LITE_WITH_PYTHON) LOG(FATAL) << "unsupported target " << TargetToStr(target); #endif break; diff --git a/lite/core/lite.map b/lite/core/lite.map index 9cfd272eb6d3017a75b40481d25527d7c14478bf..406f578fab545709b90939cdfe475a8620be6841 100644 --- a/lite/core/lite.map +++ b/lite/core/lite.map @@ -1,6 +1,6 @@ { global: - *paddle*; + *paddle*lite*; *touch_*; *mir_pass_*; local: diff --git a/lite/core/lite_tensor_test.cc b/lite/core/lite_tensor_test.cc index d667a9f8852d49bd850274bbb3c895e14d233f77..500dae3e283084ff8218fc758e1a7c5119eff16b 100644 --- a/lite/core/lite_tensor_test.cc +++ b/lite/core/lite_tensor_test.cc @@ -13,19 +13,49 @@ // limitations under the License. 
#include +#include #include "lite/core/tensor.h" namespace paddle { namespace lite { -TEST(tensor, test) { - TensorLite tensor; - DDimLite ddim({1, 8}); - tensor.Resize(ddim); +template +void test_shared_memory_tensor() { + const std::vector data({0, 1, 2, 3}); + const std::vector shape({2, 2}); + const size_t size = data.size() * sizeof(Dtype); + TensorLite init_tensor; + init_tensor.Assign(data.data(), + static_cast(shape)); + Dtype* init_raw_data = init_tensor.mutable_data(); - for (int i = 0; i < 8; i++) { - tensor.mutable_data()[i] = i; + TensorLite shared_tensor( + std::make_shared(Buffer(init_raw_data, Target, size))); + Buffer host_buffer; + host_buffer.ResetLazy(TargetType::kHost, size); + if (Target == TargetType::kHost) { + CopySync( + host_buffer.data(), init_raw_data, size, IoDirection::HtoH); + } else { + CopySync( + host_buffer.data(), init_raw_data, size, IoDirection::DtoH); } + EXPECT_EQ(std::memcmp(host_buffer.data(), data.data(), size), 0); + + shared_tensor.Resize({1, 5}); + ASSERT_DEATH(shared_tensor.mutable_data(), ""); +} + +TEST(tensor, shared_memory) { + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + test_shared_memory_tensor(); + test_shared_memory_tensor(); + test_shared_memory_tensor(); +#ifdef LITE_WITH_CUDA + test_shared_memory_tensor(); + test_shared_memory_tensor(); + test_shared_memory_tensor(); +#endif } } // namespace lite diff --git a/lite/core/memory.cc b/lite/core/memory.cc index cfb0b3ae1765864200ecf2d70107a3aa0046899c..0ee973a8b6412a2fd20e33745b7b86561696efae 100644 --- a/lite/core/memory.cc +++ b/lite/core/memory.cc @@ -51,7 +51,7 @@ void* TargetMalloc(TargetType target, size_t size) { return data; } -void TargetFree(TargetType target, void* data) { +void TargetFree(TargetType target, void* data, std::string free_flag) { switch (target) { case TargetType::kHost: case TargetType::kX86: @@ -66,7 +66,11 @@ void TargetFree(TargetType target, void* data) { #endif // LITE_WITH_CUDA #ifdef LITE_WITH_OPENCL case 
TargetType::kOpenCL: - TargetWrapperCL::Free(data); + if (free_flag == "cl_use_image2d_") { + TargetWrapperCL::FreeImage(data); + } else { + TargetWrapperCL::Free(data); + } break; #endif // LITE_WITH_OPENCL #ifdef LITE_WITH_FPGA diff --git a/lite/core/memory.h b/lite/core/memory.h index 051d47bdde102f5fe058163d0c746fe3c4acf26e..691415aecb53bf7f48faf5fbb4dbca448da04a10 100644 --- a/lite/core/memory.h +++ b/lite/core/memory.h @@ -13,8 +13,10 @@ // limitations under the License. #pragma once +#include #include "lite/api/paddle_place.h" #include "lite/core/target_wrapper.h" +#include "lite/utils/logging.h" #include "lite/utils/macros.h" #ifdef LITE_WITH_OPENCL @@ -38,7 +40,9 @@ LITE_API void* TargetMalloc(TargetType target, size_t size); // Free memory for a specific Target. All the targets should be an element in // the `switch` here. -void LITE_API TargetFree(TargetType target, void* data); +void LITE_API TargetFree(TargetType target, + void* data, + std::string free_flag = ""); // Copy a buffer from host to another target. 
void TargetCopy(TargetType target, void* dst, const void* src, size_t size); @@ -81,6 +85,9 @@ void CopySync(void* dst, const void* src, size_t size, IoDirection dir) { TargetWrapper::MemcpySync(dst, src, size, dir); break; #endif + default: + LOG(FATAL) + << "The copy function of this target has not been implemented yet."; } } @@ -89,17 +96,24 @@ class Buffer { public: Buffer() = default; Buffer(TargetType target, size_t size) : space_(size), target_(target) {} + Buffer(void* data, TargetType target, size_t size) + : space_(size), data_(data), own_data_(false), target_(target) {} void* data() const { return data_; } TargetType target() const { return target_; } size_t space() const { return space_; } + bool own_data() const { return own_data_; } void ResetLazy(TargetType target, size_t size) { if (target != target_ || space_ < size) { + CHECK_EQ(own_data_, true) << "Can not reset unowned buffer."; Free(); data_ = TargetMalloc(target, size); target_ = target; space_ = size; +#ifdef LITE_WITH_OPENCL + cl_use_image2d_ = false; +#endif } } @@ -111,14 +125,15 @@ class Buffer { const size_t img_w, const size_t img_h, void* host_ptr = nullptr) { - size_t size = sizeof(T) * img_w * img_h * - 4; // 4 for RGBA, un-used for opencl Image2D if (target != target_ || cl_image2d_width_ < img_w || cl_image2d_height_ < img_h) { + CHECK_EQ(own_data_, true) << "Can not reset unowned buffer."; Free(); data_ = TargetWrapperCL::MallocImage(img_w, img_h, host_ptr); target_ = target; - space_ = size; // un-used for opencl Image2D + space_ = sizeof(T) * img_w * img_h * + 4; // un-used for opencl Image2D, 4 for RGBA, + cl_use_image2d_ = true; cl_image2d_width_ = img_w; cl_image2d_height_ = img_h; } @@ -126,8 +141,12 @@ class Buffer { #endif void Free() { - if (space_ > 0) { - TargetFree(target_, data_); + if (space_ > 0 && own_data_) { + if (!cl_use_image2d_) { + TargetFree(target_, data_); + } else { + TargetFree(target_, data_, "cl_use_image2d_"); + } } data_ = nullptr; target_ = 
TargetType::kHost; @@ -146,9 +165,11 @@ class Buffer { private: // memory it actually malloced. size_t space_{0}; + bool cl_use_image2d_{false}; // only used for OpenCL Image2D size_t cl_image2d_width_{0}; // only used for OpenCL Image2D size_t cl_image2d_height_{0}; // only used for OpenCL Image2D void* data_{nullptr}; + bool own_data_{true}; TargetType target_{TargetType::kHost}; }; diff --git a/lite/core/mir/CMakeLists.txt b/lite/core/mir/CMakeLists.txt index 379ef67f2996519d0c8007d8f191efbd2166a9e3..82b19b030c35e69ad2a666f93475c556cc51fd23 100644 --- a/lite/core/mir/CMakeLists.txt +++ b/lite/core/mir/CMakeLists.txt @@ -36,6 +36,7 @@ lite_cc_library(mir_passes runtime_context_assign_pass.cc memory_optimize_pass.cc weight_quantization_preprocess_pass.cc + quantized_op_attributes_inference_pass.cc DEPS mir_pass types context ${mir_fusers} ${mir_subgraphs}) # lite_cc_test(test_ssa_graph SRCS ssa_graph_test.cc DEPS diff --git a/lite/core/mir/fusion/conv_activation_fuse_pass.cc b/lite/core/mir/fusion/conv_activation_fuse_pass.cc index b688bbc1083a6ab0f521381c4a988a12badc3141..68c07c0ffd0694aec0ff073082e1192213a0ef4a 100644 --- a/lite/core/mir/fusion/conv_activation_fuse_pass.cc +++ b/lite/core/mir/fusion/conv_activation_fuse_pass.cc @@ -24,17 +24,28 @@ namespace mir { void ConvActivationFusePass::Apply(const std::unique_ptr& graph) { std::vector act_types{"relu"}; + bool has_int8 = false; + bool has_arm_float = false; + bool has_cuda = false; for (auto& place : graph->valid_places()) { - if (place.target == TARGET(kCUDA)) { - act_types.push_back("leaky_relu"); - break; + if (place.precision == PRECISION(kInt8)) { + has_int8 = true; } if (place.target == TARGET(kARM) && place.precision == PRECISION(kFloat)) { - act_types.push_back("relu6"); - act_types.push_back("leaky_relu"); - break; + has_arm_float = true; + } + if (place.target == TARGET(kCUDA)) { + has_cuda = true; } } + + if (!has_int8 && has_arm_float) { + act_types.push_back("relu6"); + 
act_types.push_back("leaky_relu"); + } + if (!has_int8 && has_cuda) { + act_types.push_back("leaky_relu"); + } for (auto conv_type : {"conv2d", "depthwise_conv2d", "conv2d_transpose"}) { for (auto act_type : act_types) { for (auto has_bias : {true, false}) { diff --git a/lite/core/mir/fusion/quant_dequant_fuse_pass.cc b/lite/core/mir/fusion/quant_dequant_fuse_pass.cc index ff5a7a1f25239d9dbfc79491bd137804b16b6cfa..ab81f3d809507dd340056c97a39998c908a75dc7 100644 --- a/lite/core/mir/fusion/quant_dequant_fuse_pass.cc +++ b/lite/core/mir/fusion/quant_dequant_fuse_pass.cc @@ -45,7 +45,7 @@ void QuantDequantFusePass::Apply(const std::unique_ptr& graph) { } // delete quant_dequant_node - for (auto op_type : {"pool2d", "elementwise_add"}) { + for (auto op_type : {"pool2d", "softmax", "elementwise_add"}) { fusion::DeleteQuantDequantOpFuser fuser(op_type); fuser(graph.get()); } diff --git a/lite/core/mir/fusion/quant_dequant_op_fuser.cc b/lite/core/mir/fusion/quant_dequant_op_fuser.cc index da611e4490f4ba7268d9011b3dbb391a63a88305..7797864a2e4b75f52fd7da93ea81613a2175f423 100644 --- a/lite/core/mir/fusion/quant_dequant_op_fuser.cc +++ b/lite/core/mir/fusion/quant_dequant_op_fuser.cc @@ -297,7 +297,7 @@ cpp::OpDesc ChannelWiseDequantOpFuser::GenOpDesc(const key2nodes_t& matched) { void DeleteQuantDequantOpFuser::BuildPattern() { std::string quant_dequant_op_type = "fake_quantize_dequantize_moving_average_abs_max"; - if (quantized_op_type_ == "pool2d") { + if (quantized_op_type_ == "pool2d" || quantized_op_type_ == "softmax") { auto* input_scale_node = VarNode("input_scale_node") ->assert_is_op_input(quant_dequant_op_type, "InScale"); @@ -374,7 +374,7 @@ void DeleteQuantDequantOpFuser::BuildPattern() { void DeleteQuantDequantOpFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { - if (quantized_op_type_ == "pool2d") { + if (quantized_op_type_ == "pool2d" || quantized_op_type_ == "softmax") { auto* input_scale_node = matched.at("input_scale_node"); auto* 
input_act_node = matched.at("input_act_node"); auto* quant_dequant_node = matched.at("quant_dequant_node"); diff --git a/lite/core/mir/graph_visualize_pass.cc b/lite/core/mir/graph_visualize_pass.cc index 3a27360f94d7d828e1c19214d621f1dfe4e048ca..28ec814fa85451b5292bfde6bddc6b64b57b2f08 100644 --- a/lite/core/mir/graph_visualize_pass.cc +++ b/lite/core/mir/graph_visualize_pass.cc @@ -18,6 +18,7 @@ #include #include #include +#include #include "lite/core/mir/pass_registry.h" #include "lite/utils/string.h" @@ -28,56 +29,101 @@ namespace mir { using inference::analysis::Dot; void GraphVisualizePass::Apply(const std::unique_ptr& graph) { - Visualize(graph.get()); + VLOG(5) << "\n" << Visualize(graph.get()); } std::string Visualize(mir::SSAGraph* graph) { + std::ostringstream os; inference::analysis::Dot dot; - - int id = 0; - std::set exists_args; - for (auto& node : graph->mutable_nodes()) { - std::string key; - if (node.IsArg()) { - key = node.AsArg().name; - } else { - key = string_format("%s%d", node.AsStmt().op_type().c_str(), id++); + auto string_trunc = [](const std::string& str) -> std::string { + const int max_disp_size = 100; + if (str.length() > max_disp_size) + return str.substr(0, max_disp_size) + "..."; + return str; + }; + auto attr_repr = [&](const OpInfo* op_info, + const std::string& attr_name) -> std::string { + std::ostringstream os; + using AttrType = cpp::OpDesc::AttrType; + auto attr_type = op_info->GetAttrType(attr_name); + switch (attr_type) { + case AttrType::INT: + os << ":int:" + << paddle::lite::to_string(op_info->GetAttr(attr_name)); + break; + case AttrType::FLOAT: + os << ":float:" + << paddle::lite::to_string(op_info->GetAttr(attr_name)); + break; + case AttrType::BOOLEAN: + os << ":int:" + << paddle::lite::to_string(op_info->GetAttr(attr_name)); + break; + case AttrType::STRING: + os << ":string: \"" + << string_trunc(op_info->GetAttr(attr_name)) << "\""; + break; + case AttrType::FLOATS: { + auto vals = op_info->GetAttr>(attr_name); + 
os << ":floats: {" + Join(vals, ",") << "}"; + } break; + case AttrType::INTS: { + auto vals = op_info->GetAttr>(attr_name); + os << ":ints: {" + Join(vals, ",") + "}"; + } break; + case AttrType::STRINGS: { + auto vals = op_info->GetAttr>(attr_name); + os << ":strings: {" + string_trunc(Join(vals, ",")) << "}"; + } break; + default: + os << ":Unknow type(" << static_cast(attr_type) << ")"; + break; } - if (node.IsStmt()) { - dot.AddNode(key, - {Dot::Attr("shape", "box"), - Dot::Attr("style", "filled"), - Dot::Attr("color", "black"), - Dot::Attr("fillcolor", "yellow")}); - for (auto& x : node.inlinks) { - auto name = x->AsArg().name; - if (!exists_args.count(name)) { - dot.AddNode(name, {}); - } - dot.AddEdge(name, key, {}); - exists_args.insert(name); + return os.str(); + }; + int op_idx = 0; + std::set exists_var_names; + for (auto& node : graph->StmtTopologicalOrder()) { + if (!node->IsStmt()) continue; + auto op_info = node->AsStmt().op_info(); + auto op_type = op_info->Type(); + std::string op_name = string_format("%s%d", op_type.c_str(), op_idx++); + // Add its input&output variables as the Dot nodes + dot.AddNode(op_name, + {Dot::Attr("shape", "box"), + Dot::Attr("style", "filled"), + Dot::Attr("color", "black"), + Dot::Attr("fillcolor", "yellow")}); + for (auto& x : node->inlinks) { + auto var_name = x->AsArg().name; + if (!exists_var_names.count(var_name)) { + dot.AddNode(var_name, {}); + exists_var_names.insert(var_name); } - for (auto& x : node.outlinks) { - auto name = x->AsArg().name; - if (!exists_args.count(name)) { - dot.AddNode(name, {}); - } - dot.AddEdge(key, name, {}); - exists_args.insert(name); + dot.AddEdge(var_name, op_name, {}); + } + for (auto& x : node->outlinks) { + auto var_name = x->AsArg().name; + if (!exists_var_names.count(var_name)) { + dot.AddNode(var_name, {}); + exists_var_names.insert(var_name); } + dot.AddEdge(op_name, var_name, {}); + } + // Output its all of attributes(name and values) + os << "* " << op_name << "\n"; + 
const auto& attr_names = op_info->AttrNames(); + for (auto& attr_name : attr_names) { + os << " - " << attr_name << attr_repr(op_info, attr_name) << "\n"; } } - - auto res = dot.Build(); - // If we use VLOG here, we can not type all graph out. - // So we change VLOG to std::cout. - std::cout << "dot:\n" << res << std::endl; - return res; + os << dot.Build(); + return os.str(); } } // namespace mir } // namespace lite } // namespace paddle -REGISTER_MIR_PASS(graph_visualze, paddle::lite::mir::GraphVisualizePass) +REGISTER_MIR_PASS(graph_visualize_pass, paddle::lite::mir::GraphVisualizePass) .BindTargets({TARGET(kAny)}); diff --git a/lite/core/mir/memory_optimize_pass.cc b/lite/core/mir/memory_optimize_pass.cc index 6256a49a99b9097664c192d40502daf506437a31..38293ede76ed35bf05767ce1333947b7dfdbc4ac 100644 --- a/lite/core/mir/memory_optimize_pass.cc +++ b/lite/core/mir/memory_optimize_pass.cc @@ -39,52 +39,109 @@ void MemoryOptimizePass::CollectLifeCycleByDevice( auto is_host = [](TargetType x) -> bool { return x == TARGET(kHost) || x == TARGET(kX86) || x == TARGET(kARM); }; - // The vars which inputs or outputs are invalid op will not be reused. - auto valid_var = [&](Node* node) -> bool { - std::set invalid_op = {"while", - "conditional_block", - "conditional_block_infer", - "merge_lod_tensor_infer", - "merge_lod_tensor", - "equal", - "lod_reset", - "concat", - "yolo_box", - "subgraph", - "feed", - "fetch"}; - for (auto* tmp : node->inlinks) { - CHECK(tmp->IsStmt()); - std::string op_type = tmp->AsStmt().op_info()->Type(); - if (std::find(invalid_op.begin(), invalid_op.end(), op_type) != - invalid_op.end()) { - return false; + + // The all of input and output variables of the Ops will not be reused. 
+ std::unordered_set invalid_op_nodes = {"while", + "conditional_block", + "conditional_block_infer", + "merge_lod_tensor_infer", + "merge_lod_tensor", + "equal", + "lod_reset", + "concat", + "yolo_box", + "subgraph", + "feed", + "fetch"}; + + auto insert_invalid_op_nodes_for_specific_target = [&]( + std::unordered_set op_node_set, TargetType specific_target) { + std::unordered_set invalid_op_nodes_opencl = {"layout", "fc"}; + for (auto& op_node : graph->StmtTopologicalOrder()) { + if (!op_node->IsStmt()) continue; + TargetType op_target_type = op_node->AsStmt().place().target; + if (op_target_type == specific_target && + specific_target == TARGET(kOpenCL)) { + invalid_op_nodes.insert(invalid_op_nodes_opencl.begin(), + invalid_op_nodes_opencl.end()); + break; } + // else if // you can add more targets } - for (auto* tmp : node->outlinks) { - CHECK(tmp->IsStmt()); - std::string op_type = tmp->AsStmt().op_info()->Type(); - if (std::find(invalid_op.begin(), invalid_op.end(), op_type) != - invalid_op.end()) { - return false; + }; + + VLOG(4) << "invalid_op_nodes.size();" << invalid_op_nodes.size(); + insert_invalid_op_nodes_for_specific_target(invalid_op_nodes, + TARGET(kOpenCL)); + VLOG(4) << "invalid_op_nodes.size();" << invalid_op_nodes.size(); + + // Collect the invalid input and output variables that will not be reused. 
+ std::unordered_set invalid_var_names; + for (auto& op_node : graph->StmtTopologicalOrder()) { + // variables of invalid_op_nodes wil not be reused + if (!op_node->IsStmt()) continue; + auto op_info = op_node->AsStmt().op_info(); + auto op_type = op_info->Type(); + auto invalid_op_node = invalid_op_nodes.find(op_type); + if (invalid_op_node != invalid_op_nodes.end()) { + for (auto in_var_node : op_node->inlinks) { + CHECK(in_var_node->IsArg()); + invalid_var_names.insert(in_var_node->AsArg().name); } + for (auto out_var_node : op_node->outlinks) { + CHECK(out_var_node->IsArg()); + invalid_var_names.insert(out_var_node->AsArg().name); + } + continue; } - return true; - }; + // The specified input and output variables of the Ops whose 'inplace' attr + // is true will not be reused, such as reshape/reshape2's X and Out + // variables + std::unordered_map, + std::unordered_set>> + inplace_op_nodes = {{"reshape", {{"X"}, {"Out"}}}, + {"reshape2", {{"X"}, {"Out"}}}}; + auto inplace_op_node = inplace_op_nodes.find(op_type); + if (inplace_op_node != inplace_op_nodes.end()) { + bool inplace = false; + if (op_info->HasAttr("inplace")) { + inplace = op_info->GetAttr("inplace"); + } + if (inplace) { + for (auto& in_param_name : inplace_op_node->second.first) { + const auto& in_arg_names = op_info->Input(in_param_name); + invalid_var_names.insert(in_arg_names.begin(), in_arg_names.end()); + } + for (auto& out_param_name : inplace_op_node->second.second) { + const auto& out_arg_names = op_info->Output(out_param_name); + invalid_var_names.insert(out_arg_names.begin(), out_arg_names.end()); + } + } + } + } + + // non-tensor(like tensor_array) variables will not be reused + for (auto& node : graph->nodes()) { + if (node.IsArg() && (node.arg()->type != nullptr) && + !node.arg()->type->IsTensor()) { + invalid_var_names.insert(node.arg()->name); + } + } for (auto& op_node : graph->StmtTopologicalOrder()) { if (op_node->IsStmt()) { - auto inputs = op_node->inlinks; - auto outputs = 
op_node->outlinks; - std::vector requires(inputs.begin(), inputs.end()); - requires.insert(requires.end(), outputs.begin(), outputs.end()); - for (Node* node : requires) { - CHECK(node->IsArg()); - auto& arg = node->AsArg(); + std::vector var_nodes(op_node->inlinks.begin(), + op_node->inlinks.end()); + var_nodes.insert( + var_nodes.end(), op_node->outlinks.begin(), op_node->outlinks.end()); + for (auto* var_node : var_nodes) { + CHECK(var_node->IsArg()); + auto& arg = var_node->AsArg(); if (arg.is_weight || arg.is_persist) continue; - if (!valid_var(node)) continue; std::string var_name = arg.name; - TargetType target_type = node->AsArg().type->target(); + if (invalid_var_names.count(var_name)) continue; + TargetType target_type = arg.type->target(); if (is_host(target_type)) target_type = TARGET(kHost); if (!(*lifecycles)[TargetToStr(target_type)].count(var_name)) { @@ -181,7 +238,7 @@ void MemoryOptimizePass::PerformReusePlan( if (reuse_table.count(name) && reuse_table.at(name) != name) { auto replace_name = reuse_table.at(name); input_node->AsArg().name = - replace_name + "(" + std::to_string(node_append_idx) + ")"; + replace_name + "(" + paddle::lite::to_string(node_append_idx) + ")"; node_append_idx++; } } @@ -205,7 +262,7 @@ void MemoryOptimizePass::PerformReusePlan( if (reuse_table.count(name) && reuse_table.at(name) != name) { auto replace_name = reuse_table.at(name); out_node->AsArg().name = - replace_name + "(" + std::to_string(node_append_idx) + ")"; + replace_name + "(" + paddle::lite::to_string(node_append_idx) + ")"; node_append_idx++; } } @@ -255,5 +312,5 @@ void MemoryOptimizePass::Apply(const std::unique_ptr& graph) { } // namespace paddle REGISTER_MIR_PASS(memory_optimize_pass, paddle::lite::mir::MemoryOptimizePass) - .BindTargets({TARGET(kARM)}) - .ExcludeTargets({TARGET(kOpenCL), TARGET(kNPU), TARGET(kXPU), TARGET(kBM)}); + .BindTargets({TARGET(kARM), TARGET(kOpenCL)}) + .ExcludeTargets({TARGET(kNPU), TARGET(kXPU), TARGET(kBM)}); diff --git 
a/lite/core/mir/mlu_postprocess_pass.cc b/lite/core/mir/mlu_postprocess_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..d6240888d0806486f478511ef81ba8179b46ab43 --- /dev/null +++ b/lite/core/mir/mlu_postprocess_pass.cc @@ -0,0 +1,499 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/core/mir/mlu_postprocess_pass.h" +#include +#include +#include +#include +#include +#include +#include "lite/core/mir/graph_visualize_pass.h" +#include "lite/core/mir/pass_registry.h" +#include "lite/operators/subgraph_op.h" + +namespace paddle { +namespace lite { +namespace mir { + +Node* MLUPostprocessPass::InsertCastBefore(const std::string& op_type, + const std::string& cast_arg_name, + SSAGraph* graph, + Node* cur_node, + Node* inst_node, + const Type* cast_type) { + // create the arg node + auto* cast_arg = graph->NewArgumentNode(cast_arg_name); + cast_arg->AsArg().type = cast_type; + inst_node->AsStmt().op()->scope()->Var(cast_arg_name); + + // create the stmt node + auto* cast_inst = graph->NewInstructNode(); + // create op + auto cast_op = LiteOpRegistry::Global().Create(op_type); + CHECK(cast_op) << "create op [" << op_type << "] failed"; + cpp::OpDesc op_desc; + op_desc.SetType(op_type); + if (op_type == "cast") { + op_desc.SetAttr("in_dtype", 5); // FP32 + op_desc.SetAttr("out_dtype", 4); // FP16 + op_desc.SetInput("X", {cur_node->AsArg().name}); + 
op_desc.SetOutput("Out", {cast_arg_name}); + } else if (op_type == "transpose") { + // NCHW -> NHWC + op_desc.SetAttr>("axis", {0, 2, 3, 1}); + op_desc.SetInput("X", {cur_node->AsArg().name}); + op_desc.SetOutput("Out", {cast_arg_name}); + } else if (op_type == "io_copy") { + op_desc.SetInput("Input", {cur_node->AsArg().name}); + op_desc.SetOutput("Out", {cast_arg_name}); + } else { + CHECK(0) << "Unsupport cast type"; + } + cast_op->Attach(op_desc, inst_node->AsStmt().op()->scope()); + // create kernels + auto kernels = cast_op->CreateKernels(graph->valid_places()); + std::vector> selected_kernels; + bool is_found = false; + for (auto& kernel : kernels) { + if (op_type == "cast") { + const Type* in_arg_ty = kernel->GetInputDeclType("X"); + if (PrecisionCompatibleTo(*in_arg_ty, *cur_node->AsArg().type)) { + is_found = true; + } + } else if (op_type == "transpose") { + is_found = true; + } else if (op_type == "io_copy") { + const Type* in_arg_ty = kernel->GetInputDeclType("Input"); + const Type* out_arg_ty = kernel->GetOutputDeclType("Out"); + if (TargetCompatibleTo(*in_arg_ty, *cur_node->AsArg().type) && + TargetCompatibleTo(*out_arg_ty, *cast_type)) { + is_found = true; + } + } else { + CHECK(0) << "Unsupport cast type"; + } + if (is_found) { + selected_kernels.emplace_back(std::move(kernel)); + // we pick the kernel + cast_inst->AsStmt(op_type, std::move(selected_kernels), cast_op); + auto& stmt = cast_inst->AsStmt(); + stmt.picked_kernel().SetContext( + ContextScheduler::Global().NewContext(stmt.picked_kernel().target())); + break; + } + } + CHECK(is_found) << "Can't find a Cast kernel for Cast op: " + << cur_node->AsArg().name << "->" << op_type; + // modify links + DirectedLink(cur_node, cast_inst); + DirectedLink(cast_inst, cast_arg); + return cast_arg; +} + +Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type, + const std::string& cast_arg_name, + SSAGraph* graph, + Node* cur_node, + Node* inst_node, + const Type* cast_type) { + // create 
the arg node + auto* cast_arg = graph->NewArgumentNode(cast_arg_name); + cast_arg->AsArg().type = cast_type; + auto* var = inst_node->AsStmt().op()->scope()->Var(cast_arg_name); + // for CastAfter manully set the tensor's type + var->GetMutable<::paddle::lite::Tensor>(); + + // create the stmt node + auto* cast_inst = graph->NewInstructNode(); + // create op + auto cast_op = LiteOpRegistry::Global().Create(op_type); + CHECK(cast_op) << "create op [" << op_type << "] failed"; + cpp::OpDesc op_desc; + op_desc.SetType(op_type); + if (op_type == "cast") { + op_desc.SetAttr("in_dtype", 4); // FP32 + op_desc.SetAttr("out_dtype", 5); // FP16 + op_desc.SetInput("X", {cast_arg_name}); + op_desc.SetOutput("Out", {cur_node->AsArg().name}); + } else if (op_type == "transpose") { + // NHWC -> NCHW + op_desc.SetAttr>("axis", {0, 3, 1, 2}); + op_desc.SetInput("X", {cast_arg_name}); + op_desc.SetOutput("Out", {cur_node->AsArg().name}); + } else if (op_type == "io_copy") { + op_desc.SetInput("Input", {cast_arg_name}); + op_desc.SetOutput("Out", {cur_node->AsArg().name}); + } else { + CHECK(0) << "Unsupport cast type"; + } + + cast_op->Attach(op_desc, inst_node->AsStmt().op()->scope()); + + // create kernels + auto kernels = cast_op->CreateKernels(graph->valid_places()); + std::vector> selected_kernels; + bool is_found = false; + for (auto& kernel : kernels) { + if (op_type == "cast") { + const Type* in_arg_ty = kernel->GetInputDeclType("X"); + if (PrecisionCompatibleTo(*in_arg_ty, *cast_type)) { + is_found = true; + } + } else if (op_type == "transpose") { + is_found = true; + } else if (op_type == "io_copy") { + const Type* in_arg_ty = kernel->GetInputDeclType("Input"); + const Type* out_arg_ty = kernel->GetOutputDeclType("Out"); + if (TargetCompatibleTo(*in_arg_ty, *cast_type) && + TargetCompatibleTo(*out_arg_ty, *cur_node->AsArg().type)) { + is_found = true; + } + } else { + CHECK(0) << "Unsupport cast type"; + } + if (is_found) { + 
selected_kernels.emplace_back(std::move(kernel)); + // we pick the kernel + cast_inst->AsStmt(op_type, std::move(selected_kernels), cast_op); + auto& stmt = cast_inst->AsStmt(); + stmt.picked_kernel().SetContext( + ContextScheduler::Global().NewContext(stmt.picked_kernel().target())); + break; + } + } + CHECK(is_found) << "Can't find a Cast kernel for Cast op: " + << cur_node->AsArg().name << "->" << op_type; + // modify links + DirectedLink(cast_arg, cast_inst); + DirectedLink(cast_inst, cur_node); + return cast_arg; +} + +void MLUPostprocessPass::InsertBefore(SSAGraph* graph, + Node* head_node, + Node* inst_node, + const Type* inst_type) { + const auto* head_type = head_node->AsArg().type; + + // break original link + RemoveDirectedLink(head_node, inst_node); + + auto* cur_node = head_node; + const auto name_prefix = + head_node->AsArg().name + string_format("_%p", inst_node) + "/trans_"; + + // layout cast node + if (head_type->layout() != inst_type->layout()) { + cur_node = InsertCastBefore( + "transpose", + name_prefix + "transpose", + graph, + cur_node, + inst_node, + LiteType::GetTensorTy( + head_type->target(), head_type->precision(), inst_type->layout())); + } + + // precision cast node + if (head_type->precision() != inst_type->precision()) { + cur_node = InsertCastBefore( + "cast", + name_prefix + "cast", + graph, + cur_node, + inst_node, + LiteType::GetTensorTy( + head_type->target(), inst_type->precision(), inst_type->layout())); + } + + // io copy + cur_node = InsertCastBefore( + "io_copy", + name_prefix + "io_copy", + graph, + cur_node, + inst_node, + LiteType::GetTensorTy( + inst_type->target(), inst_type->precision(), inst_type->layout())); + + // connect cur_node to inst_node + DirectedLink(cur_node, inst_node); + + // reset opdesc and update kernel information + UpdateInputTo(inst_node->AsStmt().op()->mutable_op_info(), + head_node->AsArg().name, + cur_node->AsArg().name); + // for subgraph op, modify the BlockDesc + auto* sub_block_desc = 
dynamic_cast( + inst_node->AsStmt().op().get()) + ->GetSubBlock(); + for (size_t i = 0; i < sub_block_desc->OpsSize(); ++i) { + auto* sub_block_op_desc = sub_block_desc->GetOp(i); + UpdateInputTo( + sub_block_op_desc, head_node->AsArg().name, cur_node->AsArg().name); + } + + // recreate the op + RecreateOp(inst_node, graph); + + graph->CheckValid(); +} + +void MLUPostprocessPass::GetSubgraphOpArgType(Node* inst_node, + const Type** arg_type, + SSAGraph* graph) { + CHECK(inst_node->IsStmt()); + constexpr auto subgraph_target = TARGET(kMLU); + constexpr auto subgraph_layout = DATALAYOUT(kNHWC); + + // get subgraph's valid precision + const auto& places = graph->valid_places(); + std::set<::paddle::lite_api::PrecisionType> prec_set; + for (const auto& place : places) { + if (place.target == TARGET(kMLU)) { + prec_set.insert(place.precision); + } + } + + // get subgraph op's type info + size_t kernel_size = inst_node->AsStmt().kernels().size(); + CHECK_GT(kernel_size, 0); + VLOG(4) << "subgraph kernel size: " << kernel_size; + + for (size_t i = 0; i < kernel_size; ++i) { + auto* kernel = inst_node->AsStmt().kernels()[i].get(); + VLOG(4) << i << "th kernel: " << TargetToStr(kernel->target()) << ", " + << PrecisionToStr(kernel->precision()) << ", " + << DataLayoutToStr(kernel->layout()); + } + + for (size_t i = 0; i < kernel_size; ++i) { + auto* kernel = inst_node->AsStmt().kernels()[i].get(); + CHECK(kernel->target() == subgraph_target); + CHECK(kernel->layout() == subgraph_layout); + if (prec_set.count(kernel->precision()) == 1) { + const auto subgraph_precision = kernel->precision(); + CHECK(subgraph_precision == PRECISION(kFloat) || + subgraph_precision == PRECISION(kFP16)) + << "Mlu node has unsupport precision"; + VLOG(4) << "picked kernel precision: " + << PrecisionToStr(subgraph_precision); + *arg_type = LiteType::GetTensorTy( + subgraph_target, subgraph_precision, subgraph_layout); + break; + } + } +} + +bool MLUPostprocessPass::NeedInsert(Node* node, const 
Type* inst_type) { + CHECK(node->IsArg()); + + // some op, for example batch_norm, has output nodes useless + if (node->outlinks.size() == 0) { + return false; + } + + // check if node is weight or persistent + bool is_persist = node->AsArg().is_weight || node->AsArg().is_persist; + if (is_persist) { + VLOG(4) << "Persistent arg name: " << node->AsArg().name + << " is_weight: " << node->AsArg().is_weight + << " is_persist: " << node->AsArg().is_persist; + return false; + } + + const auto target = node->AsArg().type->target(); + const auto precision = node->AsArg().type->precision(); + const auto layout = node->AsArg().type->layout(); + VLOG(4) << "arg name: " << node->AsArg().name + << " type: " << TargetToStr(target) << ", " + << PrecisionToStr(precision) << ", " << DataLayoutToStr(layout); + + // do not insert nodes if previous node is on mlu already + if (target == inst_type->target()) { + CHECK(layout == inst_type->layout()) << "Mlu node has wrong layout"; + return false; + } + + return true; +} + +void MLUPostprocessPass::InsertAfter(SSAGraph* graph, + Node* tail_node, + Node* inst_node, + const Type* inst_type) { + const auto* tail_type = tail_node->AsArg().type; + + // break original link + RemoveDirectedLink(inst_node, tail_node); + + auto* cur_node = tail_node; + const auto name_prefix = + tail_node->AsArg().name + string_format("_%p", inst_node) + "/trans_"; + + // layout cast node + if (tail_type->layout() != inst_type->layout()) { + cur_node = InsertCastAfter( + "transpose", + name_prefix + "transpose", + graph, + cur_node, + inst_node, + LiteType::GetTensorTy( + tail_type->target(), tail_type->precision(), inst_type->layout())); + } + + // precision cast node + if (tail_type->precision() != inst_type->precision()) { + cur_node = InsertCastAfter( + "cast", + name_prefix + "cast", + graph, + cur_node, + inst_node, + LiteType::GetTensorTy( + tail_type->target(), inst_type->precision(), inst_type->layout())); + } + + // io copy + cur_node = 
InsertCastAfter( + "io_copy", + name_prefix + "io_copy", + graph, + cur_node, + inst_node, + LiteType::GetTensorTy( + inst_type->target(), inst_type->precision(), inst_type->layout())); + + // connect cur_node to inst_node + DirectedLink(inst_node, cur_node); + + // reset opdesc and update kernel information + UpdateOutputTo(inst_node->AsStmt().op()->mutable_op_info(), + tail_node->AsArg().name, + cur_node->AsArg().name); + // for subgraph op, modify the BlockDesc + auto* sub_block_desc = dynamic_cast( + inst_node->AsStmt().op().get()) + ->GetSubBlock(); + for (size_t i = 0; i < sub_block_desc->OpsSize(); ++i) { + auto* sub_block_op_desc = sub_block_desc->GetOp(i); + UpdateOutputTo( + sub_block_op_desc, tail_node->AsArg().name, cur_node->AsArg().name); + } + + // recreate the op + RecreateOp(inst_node, graph); + + graph->CheckValid(); +} + +void MLUPostprocessPass::RecreateOp(Node* inst_node, SSAGraph* graph) { + auto original_selected_kernel = + std::move(inst_node->AsStmt().kernels().front()); + auto updated_op_info = *inst_node->AsStmt().mutable_op_info(); + + inst_node->AsStmt().ResetOp(updated_op_info, graph->valid_places()); + inst_node->AsStmt().kernels().clear(); + inst_node->AsStmt().kernels().emplace_back( + std::move(original_selected_kernel)); + for (auto& kernel : inst_node->AsStmt().kernels()) { + VLOG(4) << "kernel info: " << kernel->name(); + inst_node->AsStmt().op()->AttachKernel(kernel.get()); + } +} + +void MLUPostprocessPass::ModifyLayout(SSAGraph* graph) { + for (auto& node : graph->mutable_nodes()) { + if (!node.IsStmt()) continue; + if (node.AsStmt().op_type() == "feed") { + for (auto& out : node.outlinks) { + bool change = true; + for (auto& inst : out->outlinks) { + if (inst->AsStmt().op_type() != "subgraph") { + change = false; + break; + } + } + if (change) { + const auto* old_type = out->AsArg().type; + out->AsArg().type = + LiteType::GetTensorTy(old_type->target(), + old_type->precision(), + ::paddle::lite_api::DataLayoutType::kNHWC, + 
old_type->device()); + } + } + } + if (node.AsStmt().op_type() == "fetch") { + for (auto& inp : node.inlinks) { + bool change = true; + for (auto& inst : inp->inlinks) { + if (inst->AsStmt().op_type() != "subgraph") { + change = false; + break; + } + } + if (change) { + const auto* old_type = inp->AsArg().type; + inp->AsArg().type = + LiteType::GetTensorTy(old_type->target(), + old_type->precision(), + ::paddle::lite_api::DataLayoutType::kNHWC, + old_type->device()); + } + } + } + } +} + +void MLUPostprocessPass::Apply(const std::unique_ptr& graph) { + // currently for non-persistent input and output args, mlu subgraph op + // only support float16/float32 data type + + // in two situations as folllows: + // 1: feed->arg_in->subgraph->... 2: ...->subgraph->arg_out->fetch; + // arg_in and arg_out are assumed to be NHWC which user should be aware of. + // Thus here we change these args' layout to NHWC + ModifyLayout(graph.get()); + + // insert io_copy, layout and precision cast of subgraph's inputs and outputs + for (auto& node : graph->mutable_nodes()) { + if (node.IsStmt() && node.AsStmt().op_type() == "subgraph") { + const Type* subgraph_arg_type = nullptr; + GetSubgraphOpArgType(&node, &subgraph_arg_type, graph.get()); + + auto links_tmp = node.inlinks; + for (auto p_in : links_tmp) { + if (NeedInsert(p_in, subgraph_arg_type)) { + InsertBefore(graph.get(), p_in, &node, subgraph_arg_type); + } + } + links_tmp.assign(node.outlinks.begin(), node.outlinks.end()); + for (auto p_out : links_tmp) { + if (NeedInsert(p_out, subgraph_arg_type)) { + InsertAfter(graph.get(), p_out, &node, subgraph_arg_type); + } + } + } + } +} + +} // namespace mir +} // namespace lite +} // namespace paddle + +REGISTER_MIR_PASS(mlu_postprocess_pass, paddle::lite::mir::MLUPostprocessPass) + .BindTargets({TARGET(kMLU)}); diff --git a/lite/core/mir/mlu_postprocess_pass.h b/lite/core/mir/mlu_postprocess_pass.h new file mode 100644 index 
0000000000000000000000000000000000000000..8ffcbc952a44abea272bdd22467d86cd04baa207 --- /dev/null +++ b/lite/core/mir/mlu_postprocess_pass.h @@ -0,0 +1,114 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include "lite/core/mir/pass.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace mir { + +static void UpdateInputTo(cpp::OpDesc* desc, + const std::string& from, + const std::string& to) { + for (auto& item : *desc->mutable_inputs()) { + for (auto& input : item.second) { + if (input == from) { + input = to; + } + } + } + if (desc->Type() != "subgraph") return; + auto input_names = + desc->GetAttr>("input_data_names"); + for (size_t i = 0; i < input_names.size(); ++i) { + if (input_names[i] == from) { + input_names[i] = to; + } + } + desc->SetAttr>("input_data_names", input_names); +} + +static void UpdateOutputTo(cpp::OpDesc* desc, + const std::string& from, + const std::string& to) { + for (auto& item : *desc->mutable_outputs()) { + for (auto& output : item.second) { + if (output == from) { + output = to; + } + } + } + if (desc->Type() != "subgraph") return; + auto output_names = + desc->GetAttr>("output_data_names"); + for (size_t i = 0; i < output_names.size(); ++i) { + if (output_names[i] == from) { + output_names[i] = to; + } + } + desc->SetAttr>("output_data_names", output_names); +} + +/* + * The 
pass changes the node's target to mlu which follows a mlu subgraph op + * */ +class MLUPostprocessPass : public ProgramPass { + public: + void Apply(const std::unique_ptr& graph) override; + + private: + void GetSubgraphOpArgType(Node* inst_node, + const Type** arg_type, + SSAGraph* graph); + + void ModifyLayout(SSAGraph* graph); + + bool NeedInsert(Node* node, const Type* inst_type); + + void InsertBefore(SSAGraph* graph, + Node* head_node, + Node* inst_node, + const Type* type); + + void InsertAfter(SSAGraph* graph, + Node* tail_node, + Node* inst_node, + const Type* type); + + Node* InsertCastBefore(const std::string& op_type, + const std::string& cast_arg_name, + SSAGraph* graph, + Node* cur_node, + Node* inst_node, + const Type* cast_type); + + Node* InsertCastAfter(const std::string& op_type, + const std::string& cast_arg_name, + SSAGraph* graph, + Node* cur_node, + Node* inst_node, + const Type* cast_type); + + void RecreateOp(Node* inst_node, SSAGraph* graph); +}; + +} // namespace mir +} // namespace lite +} // namespace paddle diff --git a/lite/core/mir/node.h b/lite/core/mir/node.h index e7c44d2be689a9d890158c097e198314413d1ba3..45b15812fadb0789edea3f89fb00b4612bdb010f 100644 --- a/lite/core/mir/node.h +++ b/lite/core/mir/node.h @@ -85,7 +85,7 @@ class Node { struct Arg { std::string name; int id{0}; - const Type* type{}; + const Type* type{nullptr}; // Weight is a special kind of argument, it is marked as weight explicitly // so that some weight related optimization can take place. bool is_weight{false}; diff --git a/lite/core/mir/quantized_op_attributes_inference_pass.cc b/lite/core/mir/quantized_op_attributes_inference_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..40cad8f6af75300ab85753b16e391daeeadc6c2f --- /dev/null +++ b/lite/core/mir/quantized_op_attributes_inference_pass.cc @@ -0,0 +1,80 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/core/mir/quantized_op_attributes_inference_pass.h" +#include +#include +#include +#include +#include +#include +#include +#include "lite/core/mir/graph_visualize_pass.h" +#include "lite/core/mir/pass_registry.h" + +namespace paddle { +namespace lite { +namespace mir { + +void QuantizedOpAttributesInferencePass::Apply( + const std::unique_ptr& graph) { + // Only for fully quantized model which is only supported by MTK and RK NPU. + // Replace the output_scale with the input_scale of the adjacent quantized + // ops, and fix the missing of the attribute 'enable_int8'. 
+ for (auto& op_node : graph->StmtTopologicalOrder()) { + if (!op_node->IsStmt()) continue; + auto& inst = op_node->AsStmt(); + auto op_info = inst.op_info(); + auto op_type = op_info->Type(); + if (!op_info->HasAttr("input_scale")) continue; + bool found = false; + float output_scale; + for (auto out_var_node : op_node->outlinks) { + CHECK(out_var_node->IsArg()); + for (auto out_op_node : out_var_node->outlinks) { + CHECK(out_op_node->IsStmt()); + auto& out_inst = out_op_node->AsStmt(); + auto out_op_info = out_inst.op_info(); + if (!out_op_info->HasAttr("input_scale")) continue; + auto input_scale = out_op_info->GetAttr("input_scale"); + if (!found) { + found = true; + output_scale = input_scale; + } else { + CHECK_EQ(output_scale, input_scale); + } + } + } + if (found) { + inst.mutable_op_info()->SetAttr("output_scale", output_scale); + } else if (op_info->HasAttr("output_scale")) { + int bit_length = op_info->GetAttr("bit_length"); + int range = (1 << (bit_length - 1)) - 1; + output_scale = op_info->GetAttr("output_scale"); + inst.mutable_op_info()->SetAttr("output_scale", output_scale / range); + } + if (op_info->HasAttr("output_scale")) { + inst.mutable_op_info()->SetAttr("enable_int8", true); + } + } + VLOG(5) << "\n" << Visualize(graph.get()); +} + +} // namespace mir +} // namespace lite +} // namespace paddle + +REGISTER_MIR_PASS(quantized_op_attributes_inference_pass, + paddle::lite::mir::QuantizedOpAttributesInferencePass) + .BindTargets({TARGET(kNPU)}); diff --git a/lite/core/mir/quantized_op_attributes_inference_pass.h b/lite/core/mir/quantized_op_attributes_inference_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..2b475e0b3d662a9837b7766efb4ccc8f87037b7a --- /dev/null +++ b/lite/core/mir/quantized_op_attributes_inference_pass.h @@ -0,0 +1,36 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include "lite/core/mir/pass.h" +#include "lite/core/types.h" + +namespace paddle { +namespace lite { +namespace mir { + +class QuantizedOpAttributesInferencePass : public mir::StmtPass { + public: + void Apply(const std::unique_ptr& graph) override; +}; + +} // namespace mir +} // namespace lite +} // namespace paddle diff --git a/lite/core/mir/ssa_graph.cc b/lite/core/mir/ssa_graph.cc index 2b5b65ce5903ede41137311c585c0e87eaaa0e9d..6c45ce828249c3e236706c297db3d434c71c351a 100644 --- a/lite/core/mir/ssa_graph.cc +++ b/lite/core/mir/ssa_graph.cc @@ -140,9 +140,18 @@ void SSAGraph::Build(const Program &program, arg_node->AsArg(name, node_storage_.size() - 1); arg_update_node_map_[name] = arg_node; } - if (var_types.count(name) && !arg_node->arg()->type) { - arg_node->arg()->type = LiteType::GetTensorTy( - TARGET(kUnk), var_types[name], DATALAYOUT(kUnk)); + if (var_types.count(name)) { + if (!arg_node->arg()->type) { + arg_node->arg()->type = LiteType::GetTensorTy( + TARGET(kUnk), var_types[name], DATALAYOUT(kUnk)); + } + // Store the original data type of the output tensors for + // type_precision_cast_pass, to keep the consistency between the + // output types of original graph and optimized graph's + if (op->op_info()->Type() == "fetch") { + op->mutable_op_info()->SetAttr( + "data_type", static_cast(var_types[name])); + } } if 
(is_weights(name)) arg_node->AsArg().is_weight = true; CHECK(arg_node->IsRoleSet()); diff --git a/lite/core/mir/static_kernel_pick_pass.h b/lite/core/mir/static_kernel_pick_pass.h index f655b298bf2d800f4adf142ad14b8ac05ca00482..6d45be3b898271f0801d289d16235d3fb5fdd706 100644 --- a/lite/core/mir/static_kernel_pick_pass.h +++ b/lite/core/mir/static_kernel_pick_pass.h @@ -58,7 +58,7 @@ class StaticKernelPickPass : public mir::StmtPass { const std::unordered_map& out_types, const std::vector& in_names, const std::vector& out_names) { - CHECK_GT(places.size(), 0) << "valid_places is empty."; + CHECK_GT(places.size(), static_cast(0)) << "valid_places is empty."; float final_score{-1.}; Place winner_place{places[0]}; const int kMax = @@ -145,11 +145,12 @@ class StaticKernelPickPass : public mir::StmtPass { } VLOG(4) << "[score(final)]:" << final_score; - VLOG(4) << "-------- pick summary --------"; - VLOG(4) << " ===> winner_place():" << PrecisionToStr(winner_place.precision) + VLOG(2) << "-------- pick summary for " << instruct.op_type() + << " --------"; + VLOG(2) << " ===> winner_place():" << PrecisionToStr(winner_place.precision) << " " << DataLayoutToStr(winner_place.layout) << " " << TargetToStr(winner_place.target); - VLOG(4) << " ===> kernel.place():" + VLOG(2) << " ===> kernel.place():" << PrecisionToStr(kernel.place().precision) << " " << DataLayoutToStr(kernel.place().layout) << " " << TargetToStr(kernel.place().target); diff --git a/lite/core/mir/subgraph/subgraph_detector.cc b/lite/core/mir/subgraph/subgraph_detector.cc index 6d48b053a1a4140252d35e85d2351644d3c216e9..6844fd96688d5086b47d66a32f770a757f56fda4 100644 --- a/lite/core/mir/subgraph/subgraph_detector.cc +++ b/lite/core/mir/subgraph/subgraph_detector.cc @@ -22,6 +22,9 @@ #include "lite/core/mir/pass_registry.h" #include "lite/core/mir/pattern_matcher.h" #include "lite/operators/subgraph_op.h" +#include "lite/utils/env.h" +#include "lite/utils/io.h" +#include "lite/utils/string.h" namespace paddle { 
namespace lite { @@ -63,11 +66,11 @@ std::string SubgraphVisualizer::operator()() { } else { exists_ops[op_type]++; } - auto op_name = op_type + std::to_string(exists_ops[op_type]); + auto op_name = op_type + paddle::lite::to_string(exists_ops[op_type]); std::string op_color = "white"; if (subgraph_indices.count(node)) { auto subgraph_idx = subgraph_indices[node]; - op_name += "_subgraph_" + std::to_string(subgraph_idx); + op_name += "_subgraph_" + paddle::lite::to_string(subgraph_idx); op_color = subgraph_colors[subgraph_idx % subgraph_colors.size()]; } dot.AddNode(op_name, @@ -209,8 +212,82 @@ void SubgraphDetector::FlexibleDFS( } } +std::unordered_set SubgraphDetector::GetExcludedNodesFromConfigFile() { + // get exclude nodes from config file + std::unordered_set excluded_nodes; + std::string config_file_path = + GetStringFromEnv(SUBGRAPH_CUSTOM_PARTITION_CONFIG_FILE); + if (!IsFileExists(config_file_path)) { + return excluded_nodes; + } + std::vector lines = ReadLines(config_file_path); + + for (std::string line : lines) { + if (line.empty()) continue; + std::vector node_info = Split(line, ":"); + std::string op_type = node_info.at(0); + std::vector in_vars_name; + if (node_info.size() > 1) { + in_vars_name = Split(node_info.at(1), ","); + } + std::vector out_vars_name; + if (node_info.size() > 2) { + out_vars_name = Split(node_info.at(2), ","); + } + + for (auto &node : graph_->mutable_nodes()) { + if (node.IsArg()) continue; + auto stmt = node.stmt(); + if (op_type != stmt->op_type()) continue; + auto in_nodes = node.inlinks; + auto out_nodes = node.outlinks; + if (in_vars_name.size() > in_nodes.size() || + out_vars_name.size() > out_nodes.size()) { + continue; + } + + bool matched = true; + + for (auto in_var_name : in_vars_name) { + bool find_var = false; + for (auto *in_node : in_nodes) { + if (in_node->arg()->name == in_var_name) { + find_var = true; + break; + } + } + if (!find_var) { + matched = false; + break; + } + } + + for (auto out_var_name : 
out_vars_name) { + bool find_var = false; + for (auto *out_node : out_nodes) { + if (out_node->arg()->name == out_var_name) { + find_var = true; + break; + } + } + if (!find_var) { + matched = false; + break; + } + } + + if (matched) { + excluded_nodes.insert(&node); + } + } + } + + return excluded_nodes; +} + void SubgraphDetector::InitNodes(node_map_t *nodes) { // Initialize and mark the subgraph detector nodes based on teller. + std::unordered_set excluded_nodes = GetExcludedNodesFromConfigFile(); for (auto &it : *nodes) { for (auto &in_node : it.first->inlinks) { it.second->inlinks.push_back((*nodes)[in_node]); @@ -218,7 +295,7 @@ void SubgraphDetector::InitNodes(node_map_t *nodes) { for (auto &out_node : it.first->outlinks) { it.second->outlinks.push_back((*nodes)[out_node]); } - if (teller_(it.first)) { + if (teller_(it.first) && excluded_nodes.count(it.first) == 0) { it.second->marked = true; if (it.first->IsStmt()) { // If a function is inside the subgraph, mark all the output variables @@ -331,7 +408,7 @@ void SubgraphFuser::InsertNewNode(SSAGraph *graph, cpp::OpDesc subgraph_op_desc; subgraph_op_desc.SetType("subgraph"); - // Create a new sub block desc for storing all of Ops an Vars of the target + // Create a new sub block desc for storing all of Ops and Vars of the target // subgraph and sub_block_idx is set as a attribute of subgraph op, // sub_block_idx < 0 means it's a new subgraph op int sub_block_idx = -(subgraph_idx + 1); @@ -341,9 +418,6 @@ void SubgraphFuser::InsertNewNode(SSAGraph *graph, for (auto &op_node : subgraph_nodes) { auto sub_block_op_desc = sub_block_desc->AddOp(); *sub_block_op_desc = *op_node->AsStmt().op_info(); - sub_block_op_desc->SetAttr( - kKernelTypeAttr, - op_node->AsStmt().picked_kernel().SerializedKernelType()); } subgraph_op_desc.SetAttr("sub_block", sub_block_idx); @@ -375,6 +449,37 @@ void SubgraphFuser::InsertNewNode(SSAGraph *graph, subgraph_op_desc.SetAttr>("output_data_names", output_var_names); + // Set 
input/output scale values of input/output var nodes for + // type_precision_cast_pass. + std::vector input_data_scales; + std::vector output_data_scales; + for (auto &var_node : input_var_nodes) { + auto any_op_node = var_node->outlinks.front(); + CHECK(any_op_node->IsStmt()); + auto &any_inst = any_op_node->AsStmt(); + if (any_inst.op_info()->HasAttr("input_scale")) { + input_data_scales.push_back( + any_inst.op_info()->GetAttr("input_scale")); + } + } + for (auto &var_node : output_var_nodes) { + auto any_op_node = var_node->inlinks.front(); + CHECK(any_op_node->IsStmt()); + auto &any_inst = any_op_node->AsStmt(); + if (any_inst.op_info()->HasAttr("output_scale")) { + output_data_scales.push_back( + any_inst.op_info()->GetAttr("output_scale")); + } + } + if (input_data_scales.size() > 0) { + subgraph_op_desc.SetAttr>("input_data_scales", + input_data_scales); + } + if (output_data_scales.size() > 0) { + subgraph_op_desc.SetAttr>("output_data_scales", + output_data_scales); + } + // Set all of the inputs and outputs to the target subgraph op // To prevent vars are removed in RuntimeProgram::UpdateVarsOfProgram() for (auto &var_node : weight_var_nodes) { @@ -413,12 +518,6 @@ void SubgraphFuser::InsertNewNode(SSAGraph *graph, IR_OP_VAR_LINK(subgraph_op_node, var_node); } - // Create and assign the context to the picked kernel of the new subgraph - // node - auto &inst = subgraph_op_node->AsStmt(); - inst.picked_kernel().SetContext( - ContextScheduler::Global().NewContext(inst.picked_kernel().target())); - // Remove subgraph nodes and unused var nodes auto nodes2rm = GetNodes2RM(subgraph_nodes, {input_var_nodes, diff --git a/lite/core/mir/subgraph/subgraph_detector.h b/lite/core/mir/subgraph/subgraph_detector.h index b6873655e976a785383269972221f001196431f8..567f2446a2af31c739b049005d2960ffbc802ef9 100644 --- a/lite/core/mir/subgraph/subgraph_detector.h +++ b/lite/core/mir/subgraph/subgraph_detector.h @@ -63,6 +63,7 @@ class SubgraphDetector { node_dat_t* 
UnionFindAncestor(); void UnionFindCombine(node_dat_t* candidate); }; + SubgraphDetector(SSAGraph* graph, const SubgraphTeller& teller) : graph_(graph), teller_(teller) {} std::vector> operator()(); @@ -71,7 +72,11 @@ class SubgraphDetector { bool reverse, const std::function& enter, const std::function& leave); + + std::unordered_set GetExcludedNodesFromConfigFile(); + void InitNodes(node_map_t* nodes); + std::vector> ExtractSubgraphs(node_map_t* nodes); protected: diff --git a/lite/core/mir/subgraph/subgraph_detector_test.cc b/lite/core/mir/subgraph/subgraph_detector_test.cc index 3b0d7c5cd5c8a0d0901750148359f430b6d49894..974772a9839c1e089359be3ae98e1833645ccd7a 100644 --- a/lite/core/mir/subgraph/subgraph_detector_test.cc +++ b/lite/core/mir/subgraph/subgraph_detector_test.cc @@ -39,7 +39,7 @@ std::vector AddFCDesc( CHECK_EQ(input_var_names.size(), 1); CHECK_EQ(wshape.size(), 2); static int id = 0; - std::string prefix = "fc_" + std::to_string(id); + std::string prefix = "fc_" + paddle::lite::to_string(id); auto* op_desc = block_desc->AddOp(); auto* wgt = block_desc->AddVar(); @@ -76,7 +76,7 @@ std::vector AddElementwiseAddDesc( const std::vector& input_Y_names) { // CHECK_EQ(input_var_names.size(), 2); static int id = 0; - std::string prefix = "elementwise_add_" + std::to_string(id); + std::string prefix = "elementwise_add_" + paddle::lite::to_string(id); auto* op_desc = block_desc->AddOp(); auto* out = block_desc->AddVar(); @@ -100,7 +100,7 @@ std::vector AddFeedDesc( const std::vector& input_X_names) { // CHECK_EQ(input_var_names.size(), 1); static int id = 0; - std::string prefix = "feed_" + std::to_string(id); + std::string prefix = "feed_" + paddle::lite::to_string(id); auto* op_desc = block_desc->AddOp(); auto* out = block_desc->AddVar(); @@ -123,7 +123,7 @@ std::vector AddFetchDesc( const std::vector& input_X_names) { // CHECK_EQ(input_var_names.size(), 1); static int id = 0; - std::string prefix = "fetch_" + std::to_string(id); + std::string prefix = 
"fetch_" + paddle::lite::to_string(id); auto* op_desc = block_desc->AddOp(); auto* out = block_desc->AddVar(); @@ -220,8 +220,8 @@ TEST(Subgraph, detect_custom_model) { }; std::vector> subgraphs = mir::SubgraphDetector(graph.get(), teller)(); - ASSERT_EQ(subgraphs.size(), 1); mir::SubgraphVisualizer(graph.get(), subgraphs)(); + ASSERT_EQ(subgraphs.size(), 1); } } // namespace lite diff --git a/lite/core/mir/subgraph/subgraph_pass_test.cc b/lite/core/mir/subgraph/subgraph_pass_test.cc index 247795a86ce2cbe962b161311f7845622ee3983e..7117e1b3399fe823194f7f1a4d4c239099580955 100644 --- a/lite/core/mir/subgraph/subgraph_pass_test.cc +++ b/lite/core/mir/subgraph/subgraph_pass_test.cc @@ -15,11 +15,9 @@ #include #include #include "lite/api/paddle_api.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" #include "lite/api/test_helper.h" #include "lite/utils/cp_logging.h" +#include "lite/utils/string.h" DEFINE_string(model_file, "", "model file path of combined protobuf model"); DEFINE_string(params_file, "", "params file path of combined protobuf model"); @@ -34,43 +32,17 @@ namespace lite { // The helper functions for loading and running model from command line and // verifying output data std::vector TypeParsing(std::string text) { - std::vector types; - while (!text.empty()) { - size_t index = text.find_first_of(":"); - std::string type = text.substr(0, index); - VLOG(3) << type; - types.push_back(type); - if (index == std::string::npos) { - break; - } else { - text = text.substr(index + 1); - } - } - return types; + return Split(text, ":"); } std::vector> ShapeParsing(std::string text) { std::vector> shapes; - while (!text.empty()) { - size_t index = text.find_first_of(":"); - std::string slice = text.substr(0, index); - std::vector shape; - while (!slice.empty()) { - size_t index = slice.find_first_of(","); - int d = atoi(slice.substr(0, index).c_str()); - VLOG(3) << d; - shape.push_back(d); - if 
(index == std::string::npos) { - break; - } else { - slice = slice.substr(index + 1); - } - } - shapes.push_back(shape); - if (index == std::string::npos) { - break; - } else { - text = text.substr(index + 1); + std::vector shape_strings = Split(text, ":"); + shapes.resize(shape_strings.size()); + for (int i = 0; i < shape_strings.size(); i++) { + std::vector shape_nums = Split(shape_strings[i], ","); + for (auto shape_num : shape_nums) { + shapes[i].push_back(atoi(shape_num.c_str())); } } return shapes; diff --git a/lite/core/mir/subgraph_cast_display_pass.cc b/lite/core/mir/subgraph_cast_display_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..3a2c94d23298fcb607de0bf821d0dc92c95da7bb --- /dev/null +++ b/lite/core/mir/subgraph_cast_display_pass.cc @@ -0,0 +1,111 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/core/mir/pass.h" +#include "lite/core/mir/pass_registry.h" + +namespace paddle { +namespace lite { +namespace mir { + +class SubgraphCastDisplayPass : public DebugPass { + public: + void Apply(const std::unique_ptr& graph) override { + VLOG(3) << "== Argument types =="; + for (auto& node : graph->mutable_nodes()) { + if (!node.IsArg()) continue; + + auto* type = node.AsArg().type; + if (type) { + VLOG(3) << "* ARG " << node.AsArg().name << " type: " << *type; + } else { + VLOG(3) << "* ARG " << node.AsArg().name << " type: UNK"; + } + } + VLOG(3) << "---------------------"; + + // + VLOG(0) << "== SubgraphOp Debug Info =="; + for (auto& node : graph->mutable_nodes()) { + if (node.IsStmt() && node.AsStmt().op_type() == "subgraph") { + VLOG(0) << "FOUND SUBGRAPH OP"; + display_debug_info(node, "subgraph"); + break; + } + } + VLOG(0) << "---------------------"; + } + + void display_debug_info(const Node& node, + std::string op_type, + bool display_in_nodes = true, + bool display_out_nodes = true) { + CHECK(node.IsStmt()); + VLOG(0) << node.AsStmt(); + if (display_in_nodes) { + for (auto p_in_arg_node : node.inlinks) { + CHECK(p_in_arg_node->IsArg()); + VLOG(0) << "* ARG[IN] " << p_in_arg_node->AsArg().name + << " type: " << *p_in_arg_node->AsArg().type + << " is_weight: " << p_in_arg_node->AsArg().is_weight + << " is_persist: " << p_in_arg_node->AsArg().is_persist + << " input_count: " << p_in_arg_node->inlinks.size(); + if (p_in_arg_node->inlinks.size() == 0) { + VLOG(0) << "** END with No Op"; + } + for (auto p_in_stmt_node : p_in_arg_node->inlinks) { + CHECK(p_in_stmt_node->IsStmt()); + std::string stmt_op_type = p_in_stmt_node->AsStmt().op_type(); + if (stmt_op_type == "cast" || stmt_op_type == "transpose" || + stmt_op_type == "io_copy") { + display_debug_info(*p_in_stmt_node, stmt_op_type, true, false); + } else { + VLOG(0) << "** END with op type: " << stmt_op_type; + } + } + } + } + if (display_out_nodes) { + for (auto p_out_arg_node : 
node.outlinks) { + CHECK(p_out_arg_node->IsArg()); + VLOG(0) << "* ARG[OUT] " << p_out_arg_node->AsArg().name + << " type: " << *p_out_arg_node->AsArg().type + << " is_weight: " << p_out_arg_node->AsArg().is_weight + << " is_persist: " << p_out_arg_node->AsArg().is_persist + << " output_count: " << p_out_arg_node->outlinks.size(); + if (p_out_arg_node->outlinks.size() == 0) { + VLOG(0) << "** END with No Op"; + } + for (auto p_out_stmt_node : p_out_arg_node->outlinks) { + CHECK(p_out_stmt_node->IsStmt()); + std::string stmt_op_type = p_out_stmt_node->AsStmt().op_type(); + if (stmt_op_type == "cast" || stmt_op_type == "transpose" || + stmt_op_type == "io_copy") { + display_debug_info(*p_out_stmt_node, stmt_op_type, false, true); + } else { + VLOG(0) << "** END with op type: " << stmt_op_type; + } + } + } + } + } +}; + +} // namespace mir +} // namespace lite +} // namespace paddle + +REGISTER_MIR_PASS(subgraph_cast_display_pass, + paddle::lite::mir::SubgraphCastDisplayPass) + .BindTargets({TARGET(kAny)}); diff --git a/lite/core/mir/type_layout_cast_pass.cc b/lite/core/mir/type_layout_cast_pass.cc index b3b7a858f68367ac789f390c6bd3bd94873f77d5..1133e5ba8203ec9fea177844a6311c993f6b8ff7 100644 --- a/lite/core/mir/type_layout_cast_pass.cc +++ b/lite/core/mir/type_layout_cast_pass.cc @@ -20,6 +20,8 @@ #include #include "lite/core/mir/graph_visualize_pass.h" #include "lite/core/mir/pass_registry.h" +#include "lite/core/mir/type_precision_cast_pass.h" +#include "lite/operators/subgraph_op.h" #include "lite/utils/string.h" namespace paddle { @@ -39,8 +41,9 @@ void TypeLayoutTransformPass::Apply(const std::unique_ptr& graph) { VLOG(4) << "!node->IsStmt():" << !node->IsStmt(); if (!node->IsStmt() || node->AsStmt().op_type() == "while") continue; auto inlinks = node->inlinks; - VLOG(4) << "node->AsStmt().desc:" << node->AsStmt().desc - << " inlinks.size():" << inlinks.size(); + VLOG(4) << "============== node->AsStmt().op_type():" + << node->AsStmt().op_type() << " 
inlinks.size():" << inlinks.size() + << " ================"; for (auto* in : inlinks) { ComplementInputs(graph.get(), node, in); } @@ -66,13 +69,25 @@ void TypeLayoutTransformPass::ComplementInputs(SSAGraph* graph, CHECK(inst.op_info()->GetInputArgname(in_arg_name, &inst_in_tensor_name)); auto decl_arg_type = inst.picked_kernel().GetInputDeclType(inst_in_tensor_name); + CHECK(in->AsArg().type); - VLOG(5) << "\n inst_in_tensor_name:" << inst_in_tensor_name + VLOG(3) << "\n inst_in_tensor_name:" << inst_in_tensor_name << "\n in->AsArg().name:" << in->AsArg().name << "\n *in->AsArg().type:" << *in->AsArg().type << "\n *decl_arg_type:" << *decl_arg_type << "\n inst.op()->DebugString():" << inst.op()->DebugString(); + // TODO(ysh329): conflict if tensor with kARM target but kImageDefault(OpenCL + // layout). + // not a good judge, but don't find the source of this issue from + // static_pick_kernel_pass + // to this pass. + auto* in_arg_type = const_cast(in->AsArg().type); + if (in_arg_type->target() == TARGET(kARM) && + in_arg_type->layout() == DATALAYOUT(kImageDefault)) { + return; + } + if (!DataLayoutCompatible(*in->AsArg().type, *decl_arg_type)) { VLOG(4) << "found Layout unmatched tensor: " << in->AsArg().name << " for kernel " << inst.op()->DebugString() << " " @@ -170,9 +185,8 @@ void TypeLayoutTransformPass::AddLayoutInst( DirectedLink(layout_output_arg, inst_node); // reset opdesc and update kernel information - UpdateInputTo(inst_node->AsStmt().op()->mutable_op_info(), - in->AsArg().name, - layout_output_name); + UpdateInputs( + inst_node->AsStmt().op().get(), in->AsArg().name, layout_output_name); auto original_selected_kernel = std::move(inst_node->AsStmt().kernels().front()); auto update_op_info = *inst_node->AsStmt().op_info(); @@ -204,6 +218,30 @@ void TypeLayoutTransformPass::SetValidPlaces( valid_places_ = valid_places; } +void OpenCLTypeLayoutTransformPass::Apply( + const std::unique_ptr& graph) { + // Start from inputs of the graph, those should have 
place set. + VLOG(4) << "\n" << Visualize(graph.get()); + std::list nodes; + for (auto& node : graph->StmtTopologicalOrder()) { + nodes.push_back(node); + } + + VLOG(4) << "nodes.size():" << nodes.size(); + for (auto& node : nodes) { + VLOG(4) << "!node->IsStmt():" << !node->IsStmt(); + if (!node->IsStmt() || node->AsStmt().op_type() == "while") continue; + VLOG(1) << "node->AsStmt().op_type():" << node->AsStmt().op_type(); + if (node->AsStmt().op_type() == "layout" || + node->AsStmt().op_type() == "io_copy") { + auto new_op = node->AsStmt().mutable_op_info(); + int process_type = 1; + new_op->SetAttr("process_type", process_type); + } + } + VLOG(4) << "\n" << Visualize(graph.get()); +} + } // namespace mir } // namespace lite } // namespace paddle @@ -213,3 +251,9 @@ REGISTER_MIR_PASS(type_layout_cast_pass, .BindTargets({TARGET(kAny)}) .BindKernel("layout_once") .BindKernel("layout"); + +REGISTER_MIR_PASS(type_layout_cast_preprocess_pass, + paddle::lite::mir::OpenCLTypeLayoutTransformPass) + .BindTargets({TARGET(kAny)}) + .BindKernel("layout_once") + .BindKernel("layout"); diff --git a/lite/core/mir/type_layout_cast_pass.h b/lite/core/mir/type_layout_cast_pass.h index bf36214e1dce33352468155a6817adda9039727a..4a3e4c02d1053e84dd39bee14a0e01260f0626e4 100644 --- a/lite/core/mir/type_layout_cast_pass.h +++ b/lite/core/mir/type_layout_cast_pass.h @@ -24,18 +24,6 @@ namespace paddle { namespace lite { namespace mir { -static void UpdateInputTo(cpp::OpDesc* desc, - const std::string& from, - const std::string& to) { - for (auto& item : *desc->mutable_inputs()) { - for (auto& input : item.second) { - if (input == from) { - input = to; - } - } - } -} - class TypeLayoutTransformPass : public ProgramPass { public: void Apply(const std::unique_ptr& graph) override; @@ -57,6 +45,15 @@ class TypeLayoutTransformPass : public ProgramPass { std::vector valid_places_; }; +// add preprocess and postprocess attribute for layout op +class OpenCLTypeLayoutTransformPass : public 
ProgramPass { + public: + void Apply(const std::unique_ptr& graph) override; + + private: + std::vector valid_places_; +}; + } // namespace mir } // namespace lite } // namespace paddle diff --git a/lite/core/mir/type_precision_cast_pass.cc b/lite/core/mir/type_precision_cast_pass.cc index 2f177383fc2b3a035313c0654c961c0b21a7f197..ecccf89fa76287a3f30756f7138fcce229e8f337 100644 --- a/lite/core/mir/type_precision_cast_pass.cc +++ b/lite/core/mir/type_precision_cast_pass.cc @@ -20,11 +20,116 @@ #include #include "lite/core/mir/graph_visualize_pass.h" #include "lite/core/mir/pass_registry.h" +#include "lite/operators/subgraph_op.h" namespace paddle { namespace lite { namespace mir { +// For the subgraph op, we also need to update the attr 'input_data_names' and +// the input variables names of the Ops in the subblock. +void UpdateInputsForSubgraph(OpLite* op, + const std::string& from, + const std::string& to) { + auto* op_desc = op->mutable_op_info(); + auto input_data_names = + op_desc->GetAttr>("input_data_names"); + std::replace(input_data_names.begin(), input_data_names.end(), from, to); + op_desc->SetAttr("input_data_names", input_data_names); + auto* subblock_desc = static_cast(op)->GetSubBlock(); + CHECK(subblock_desc); + for (size_t i = 0; i < subblock_desc->OpsSize(); i++) { + auto* subblock_op_desc = subblock_desc->GetOp(i); + for (auto& subblock_op_input : *subblock_op_desc->mutable_inputs()) { + for (auto& subblock_var_name : subblock_op_input.second) { + if (subblock_var_name == from) { + subblock_var_name = to; + } + } + } + } +} + +// Update the input variable names from 'from' to 'to' for the target Op +void UpdateInputs(OpLite* op, const std::string& from, const std::string& to) { + auto* op_desc = op->mutable_op_info(); + auto op_type = op_desc->Type(); + for (auto& op_input : *op_desc->mutable_inputs()) { + for (auto& var_name : op_input.second) { + if (var_name == from) { + var_name = to; + } + } + } + if (op_type == "subgraph") { + 
UpdateInputsForSubgraph(op, from, to); + } +} + +// Infer the scale value for the new calib op from the subgraph op +static bool InferScaleFromSubgraph(std::string var_name, + const OpInfo* op_info, + float* scale, + bool reverse = false) { + std::string attr_name = reverse ? "output_data_names" : "input_data_names"; + if (!op_info->HasAttr(attr_name)) return false; + auto input_or_output_names = + op_info->GetAttr>(attr_name); + attr_name = reverse ? "output_data_scales" : "input_data_scales"; + if (!op_info->HasAttr(attr_name)) return false; + auto input_or_output_scales = op_info->GetAttr>(attr_name); + auto size = input_or_output_names.size(); + CHECK(size == input_or_output_scales.size()); + for (int i = 0; i < size; i++) { + if (input_or_output_names[i] == var_name) { + *scale = input_or_output_scales[i]; + return true; + } + } + return false; +} + +// Infer the scale value for the new calib op from the input_scale of the +// current op and output_scale of the previous op. +// case 1: prev_op->var_node->op_node(int8->any op, with input_scale). +// case 2: prev_op->var_node->op_node(subgraph op, int8->any, with +// input_data_scales). +// case 3: prev_op(any->int8, with output_scale)->var_node->op_node(fp32->any, +// without input_scale). +// case 4: prev_op(any->int8, subgraph_op, with +// output_data_scales)->var_node->op_node(fp32->any, without input_scale). 
+static bool InferScale(Node* var_node, Node* op_node, float* scale) { + bool found = false; + auto& inst = op_node->AsStmt(); + auto op_info = inst.op_info(); + auto op_type = op_info->Type(); + auto var_name = var_node->AsArg().name; + if (op_type == "subgraph") { + found = InferScaleFromSubgraph(var_name, op_info, scale, false); + } else { + if (op_info->HasAttr("input_scale")) { + *scale = op_info->GetAttr("input_scale"); + found = true; + } else { + // Obtain the output_scale from one of its previous Ops + auto prev_op_node = var_node->inlinks.front(); + CHECK(prev_op_node->IsStmt()); + auto& prev_inst = prev_op_node->AsStmt(); + auto prev_op_info = prev_inst.op_info(); + auto prev_op_type = prev_op_info->Type(); + if (prev_op_type == "subgraph") { + found = InferScaleFromSubgraph(var_name, prev_op_info, scale, true); + } else { + if (prev_op_info->HasAttr("output_scale")) { + *scale = prev_op_info->GetAttr("output_scale"); + found = true; + } + } + } + } + return found; +} + void PrecisionCastPass::Apply(const std::unique_ptr& graph) { // Start from inputs of the graph, those should have place set. 
std::list nodes; @@ -59,6 +164,14 @@ void PrecisionCastPass::ComplementInputs(SSAGraph* graph, auto decl_arg_type = inst.picked_kernel().GetInputDeclType(tmp); CHECK(in->AsArg().type); VLOG(4) << inst.picked_kernel().name(); + if (inst.op_info()->Type() == "fetch") { + if (inst.op_info()->HasAttr("data_type")) { + auto data_type = + static_cast(inst.op_info()->GetAttr("data_type")); + decl_arg_type = LiteType::GetTensorTy( + decl_arg_type->target(), data_type, decl_arg_type->layout()); + } + } // if (!in->AsArg().is_weight && !PrecisionCompatibleTo(*in->AsArg().type, // *decl_arg_type)) { if (!PrecisionCompatibleTo(*in->AsArg().type, *decl_arg_type)) { @@ -88,7 +201,8 @@ void PrecisionCastPass::AddCastInst(const Type& from, CHECK(in->IsArg()); // auto node_id = [&] { return graph->nodes().size(); }; auto cast_op_output_name = in->AsArg().name + "/precision_trans"; - // in->AsArg().name + "/precision_trans/" + std::to_string(node_id()); + // in->AsArg().name + "/precision_trans/" + + // paddle::lite::to_string(node_id()); auto* cast_op_output_arg = graph->NewArgumentNode(cast_op_output_name); cast_op_output_arg->AsArg().type = LiteType::GetTensorTy(from.target(), to.precision(), from.layout()); @@ -109,10 +223,11 @@ void PrecisionCastPass::AddCastInst(const Type& from, op_desc.SetType(cast_type); op_desc.SetInput("Input", {in->AsArg().name}); op_desc.SetOutput("Out", {cast_op_output_name}); - if (inst_node->AsStmt().op_info()->HasAttr("input_scale")) { - op_desc.SetAttr( - "scale", inst_node->AsStmt().op_info()->GetAttr("input_scale")); + float scale; + if (InferScale(in, inst_node, &scale)) { + op_desc.SetAttr("scale", scale); } + cast_op->Attach(op_desc, inst_node->AsStmt().op()->scope()); auto kernels = cast_op->CreateKernels(valid_places); std::vector> selected_kernels; @@ -146,9 +261,8 @@ void PrecisionCastPass::AddCastInst(const Type& from, DirectedLink(cast_op_output_arg, inst_node); // reset opdesc and update kernel information - 
UpdateInputTo(inst_node->AsStmt().op()->mutable_op_info(), - in->AsArg().name, - cast_op_output_name); + UpdateInputs( + inst_node->AsStmt().op().get(), in->AsArg().name, cast_op_output_name); // recreate the op auto original_selected_kernel = @@ -178,5 +292,6 @@ void PrecisionCastPass::SetValidPlaces(const std::vector& valid_places) { REGISTER_MIR_PASS(type_precision_cast_pass, paddle::lite::mir::PrecisionCastPass) .BindTargets({TARGET(kAny)}) + .ExcludeTargets({TARGET(kOpenCL)}) .BindKernel("calib_once") .BindKernel("calib"); diff --git a/lite/core/mir/type_precision_cast_pass.h b/lite/core/mir/type_precision_cast_pass.h index 3f55e52ef9fed1f0b456533141654d1dcadb16f7..b5f7c5d902a998e369f0b1775c59f50cbf8dc256 100644 --- a/lite/core/mir/type_precision_cast_pass.h +++ b/lite/core/mir/type_precision_cast_pass.h @@ -24,17 +24,7 @@ namespace paddle { namespace lite { namespace mir { -static void UpdateInputTo(cpp::OpDesc* desc, - const std::string& from, - const std::string& to) { - for (auto& item : *desc->mutable_inputs()) { - for (auto& input : item.second) { - if (input == from) { - input = to; - } - } - } -} +void UpdateInputs(OpLite* op, const std::string& from, const std::string& to); /* * The pass complement the necessary instruction to make data diff --git a/lite/core/mir/type_target_cast_pass.cc b/lite/core/mir/type_target_cast_pass.cc index ae74bd8d4d5647139a13509dfda0bb2b41ecc5c7..75d8022d5f5f9d8572a5e020c11ae5d8cf630c10 100644 --- a/lite/core/mir/type_target_cast_pass.cc +++ b/lite/core/mir/type_target_cast_pass.cc @@ -21,6 +21,7 @@ #include #include "lite/core/mir/graph_visualize_pass.h" #include "lite/core/mir/pass_registry.h" +#include "lite/core/mir/type_precision_cast_pass.h" #include "lite/utils/string.h" namespace paddle { @@ -240,9 +241,8 @@ void TypeTargetTransformPass::UpdateInstNode(Node* in, Node* inst_node, std::string io_copy_output_name) { // reset opdesc and update kernel information - 
UpdateInputTo(inst_node->AsStmt().op()->mutable_op_info(), - in->AsArg().name, - io_copy_output_name); + UpdateInputs( + inst_node->AsStmt().op().get(), in->AsArg().name, io_copy_output_name); auto original_selected_kernel = std::move(inst_node->AsStmt().kernels().front()); auto update_op_info = *inst_node->AsStmt().op_info(); diff --git a/lite/core/mir/type_target_cast_pass.h b/lite/core/mir/type_target_cast_pass.h index e9a275882f7c2cb813c1c0b8add5cc4ca89b0c8b..3561a0a7dd22709648450a4b8f3c8f3f11448b38 100644 --- a/lite/core/mir/type_target_cast_pass.h +++ b/lite/core/mir/type_target_cast_pass.h @@ -25,18 +25,6 @@ namespace paddle { namespace lite { namespace mir { -static void UpdateInputTo(cpp::OpDesc* desc, - const std::string& from, - const std::string& to) { - for (auto& item : *desc->mutable_inputs()) { - for (auto& input : item.second) { - if (input == from) { - input = to; - } - } - } -} - /* * IoComplementPass complement the necessary instruction to make data * transferring or transformation between different places. diff --git a/lite/core/op_lite.cc b/lite/core/op_lite.cc index 0936a44a66e4777633b84dadf0a1dc049213faab..a9ccd1b9ae9a5d45f8d0e5638b3aab1d73d1903c 100644 --- a/lite/core/op_lite.cc +++ b/lite/core/op_lite.cc @@ -22,6 +22,61 @@ namespace paddle { namespace lite { +bool OpLite::InferShape() { + // if input_tensor_ptrs and output_tensor_ptrs are overloaded in param_ + // InferShapeByMemoryInternal will be applied. + if (param_.input_tensor_ptrs() && param_.output_tensor_ptrs()) { + return this->InferShapeWithCache(); + } else { + // otherwise, InferShapeImpl is applied directly. + return this->InferShapeImpl(); + } +} +bool OpLite::InferShapeWithCache() { + // 1. Get vector of current input tensors + auto *current_inputs = param_.input_tensor_ptrs(); + // 2. 
Get hash value of current inputs shape and lod + size_t new_hash = 0; + for (auto iter = current_inputs->begin(); iter != current_inputs->end(); + iter++) { + // combined dims value into new_hash value. + auto &element_dims = (*iter)->dims(); + for (int i = 0; i < element_dims.size(); i++) { + new_hash = + lite::hash_combine(new_hash, static_cast(element_dims[i])); + } + // combine lod value into new_hash valud. + auto &emement_lods = (*iter)->lod(); + for (auto lod_iter = emement_lods.begin(); lod_iter != emement_lods.end(); + lod_iter++) { + for (int i = 0; i < lod_iter->size(); i++) { + new_hash = + lite::hash_combine(new_hash, static_cast(lod_iter->at(i))); + } + } + } + // 3. infer shapes of output tensors + if (new_hash == io_shape_lod_hash_ && new_hash != 0) { + // if current hash value is consistent with io_shape_lod_hash_, + // previous outputs shape and lod are reused. + auto *current_outputs = param_.output_tensor_ptrs(); + for (int i = 0; i < current_outputs->size(); i++) { + current_outputs->at(i)->Resize(last_output_shapes[i]); + current_outputs->at(i)->set_lod(last_output_lods[i]); + } + } else { + // otherwise, current hash value is changed, InferShapeImpl will apply. 
+ io_shape_lod_hash_ = new_hash; + this->InferShapeImpl(); + auto *current_outputs = param_.output_tensor_ptrs(); + for (int i = 0; i < current_outputs->size(); i++) { + last_output_shapes[i] = current_outputs->at(i)->dims(); + last_output_lods[i] = current_outputs->at(i)->lod(); + } + } + return true; +} + std::vector> OpLite::CreateKernels( const std::vector &places, const std::string &kernel_type) { std::vector> kernels; @@ -47,18 +102,19 @@ std::vector> OpLite::CreateKernels( return kernels; } - std::set place_set; - for (auto place : places) { - place_set.insert(place); - // Pick kernels those support any Precision and any DataLayout - place.precision = PRECISION(kAny); - place_set.insert(place); - place.layout = DATALAYOUT(kAny); - place_set.insert(place); + std::set expanded_places(places.begin(), places.end()); + for (auto &place : places) { + // Pick kernels those support any Precision and any DataLayout, For example: + // kARM,kFloat,kNCHW -> kARM,kFloat,kAny; kARM,kAny,kNCHW; kARM,kAny,kAny + expanded_places.insert( + Place(place.target, place.precision, DATALAYOUT(kAny))); + expanded_places.insert(Place(place.target, PRECISION(kAny), place.layout)); + expanded_places.insert( + Place(place.target, PRECISION(kAny), DATALAYOUT(kAny))); } std::set targets; - for (auto place : place_set) { + for (auto place : expanded_places) { pick_kernel(place); targets.insert(place.target); } diff --git a/lite/core/op_lite.h b/lite/core/op_lite.h index 5dec9ed7aace837e3eb085a55d7b9b5382f7dea3..4c6c66be7e41889c116aed023d863df8a4a912c8 100644 --- a/lite/core/op_lite.h +++ b/lite/core/op_lite.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include #include @@ -24,6 +25,7 @@ #include "lite/core/kernel.h" #include "lite/core/scope.h" #include "lite/model_parser/cpp/op_desc.h" +#include "lite/operators/op_params.h" namespace paddle { namespace lite { @@ -64,7 +66,8 @@ class OpLite : public Registry { // Check the shape. 
virtual bool CheckShape() const { return true; } // Inference the outputs' shape. - virtual bool InferShape() const { return true; } + virtual bool InferShapeImpl() const { return true; } + virtual bool InferShape(); // Run this operator. virtual bool Run(); // Indicate whether the Op runs only once or not @@ -150,6 +153,16 @@ class OpLite : public Registry { std::vector valid_places_; Place kernel_place_{TARGET(kHost), PRECISION(kFloat)}; std::unique_ptr op_info_; + + std::vector last_output_shapes{}; + std::vector>> last_output_lods{}; + size_t io_shape_lod_hash_{}; + mutable operators::ParamBase param_; + + private: + // Infer Shape according to memory, if current input shapes are consistent + // with that of previous inputs, output shapes of last time will be reused. + bool InferShapeWithCache(); }; /* diff --git a/lite/core/op_registry.cc b/lite/core/op_registry.cc index b49670eefb8b2c6aae30cb041de4d055a2b9964c..fe1dff3c99c1d2413888e78c89c999caea0ab030 100644 --- a/lite/core/op_registry.cc +++ b/lite/core/op_registry.cc @@ -19,6 +19,10 @@ namespace paddle { namespace lite { +const std::map &GetOp2PathDict() { + return OpKernelInfoCollector::Global().GetOp2PathDict(); +} + std::list> KernelRegistry::Create( const std::string &op_type, TargetType target, @@ -103,6 +107,9 @@ std::list> KernelRegistry::Create( case TARGET(kBM): { CREATE_KERNEL(kBM); } break; + case TARGET(kMLU): { + CREATE_KERNEL(kMLU); + } break; default: CHECK(false) << "not supported kernel target " << TargetToStr(target); } @@ -135,6 +142,15 @@ KernelRegistry::KernelRegistry() INIT_FOR(kCUDA, kInt64, kNCHW); INIT_FOR(kCUDA, kInt64, kNHWC); + INIT_FOR(kMLU, kFloat, kNHWC); + INIT_FOR(kMLU, kFloat, kNCHW); + INIT_FOR(kMLU, kFP16, kNHWC); + INIT_FOR(kMLU, kFP16, kNCHW); + INIT_FOR(kMLU, kInt8, kNHWC); + INIT_FOR(kMLU, kInt8, kNCHW); + INIT_FOR(kMLU, kInt16, kNHWC); + INIT_FOR(kMLU, kInt16, kNCHW); + INIT_FOR(kHost, kFloat, kNCHW); INIT_FOR(kHost, kAny, kNCHW); INIT_FOR(kHost, kFloat, kNHWC); @@ 
-150,10 +166,13 @@ KernelRegistry::KernelRegistry() INIT_FOR(kX86, kInt64, kNCHW); INIT_FOR(kARM, kFloat, kNCHW); + INIT_FOR(kARM, kFloat, kNHWC); INIT_FOR(kARM, kInt8, kNCHW); + INIT_FOR(kARM, kInt8, kNHWC); INIT_FOR(kARM, kAny, kNCHW); INIT_FOR(kARM, kAny, kAny); INIT_FOR(kARM, kInt32, kNCHW); + INIT_FOR(kARM, kInt64, kNCHW); INIT_FOR(kOpenCL, kFloat, kNCHW); INIT_FOR(kOpenCL, kFloat, kNHWC); @@ -175,8 +194,11 @@ KernelRegistry::KernelRegistry() INIT_FOR(kOpenCL, kAny, kImageNW); INIT_FOR(kNPU, kFloat, kNCHW); + INIT_FOR(kNPU, kFloat, kNHWC); INIT_FOR(kNPU, kInt8, kNCHW); + INIT_FOR(kNPU, kInt8, kNHWC); INIT_FOR(kNPU, kAny, kNCHW); + INIT_FOR(kNPU, kAny, kNHWC); INIT_FOR(kNPU, kAny, kAny); INIT_FOR(kXPU, kFloat, kNCHW); diff --git a/lite/core/op_registry.h b/lite/core/op_registry.h index a49682eea68240bfa178eb3d3351b8c7fb41048d..3c41c1fd8af240401c3edf0343433f8d8d9c85db 100644 --- a/lite/core/op_registry.h +++ b/lite/core/op_registry.h @@ -72,6 +72,8 @@ class OpKernelInfoCollector { namespace paddle { namespace lite { +const std::map &GetOp2PathDict(); + using KernelFunc = std::function; using KernelFuncCreator = std::function()>; class LiteOpRegistry final : public Factory> { @@ -145,6 +147,9 @@ class KernelRegistry final { KernelRegistryForTarget *, // + KernelRegistryForTarget *, // KernelRegistryForTarget *, // @@ -263,7 +268,32 @@ class KernelRegistry final { DATALAYOUT(kAny)> *, // KernelRegistryForTarget * // + DATALAYOUT(kAny)> *, // + + KernelRegistryForTarget *, // + KernelRegistryForTarget *, // + KernelRegistryForTarget *, // + KernelRegistryForTarget *, // + KernelRegistryForTarget *, // + KernelRegistryForTarget *, // + KernelRegistryForTarget *, // + KernelRegistryForTarget * // >; KernelRegistry(); diff --git a/lite/core/optimizer.h b/lite/core/optimizer.h index ddd94484ac4bb8d96d5c55300c985d21b44f1843..ca22c86907d4f582ef9d7ca84b908711ba1b8660 100644 --- a/lite/core/optimizer.h +++ b/lite/core/optimizer.h @@ -53,7 +53,7 @@ class Optimizer { 
SpecifyKernelPickTactic(kernel_pick_factor); InitTargetTypeTransformPass(); - if (passes.empty()) { + if (passes.empty() || passes.size() == 1) { std::vector passes_local{ {"lite_quant_dequant_fuse_pass", // "weight_quantization_preprocess_pass", // @@ -75,6 +75,15 @@ class Optimizer { (defined LITE_WITH_ARM) "lite_elementwise_add_activation_fuse_pass", // #endif + "quantized_op_attributes_inference_pass", // Only for fully + // quantized model, infer + // the output scale and + // fix the attribute + // 'enable_int8' for all + // of the quantized ops. + "npu_subgraph_pass", + "xpu_subgraph_pass", + "bm_subgraph_pass", "static_kernel_pick_pass", // pick original kernel from graph "variable_place_inference_pass", // inference arg/var's // info(target/precision/layout/device) @@ -108,9 +117,10 @@ class Optimizer { "runtime_context_assign_pass", "argument_type_display_pass", - "memory_optimize_pass", - "npu_subgraph_pass", - "xpu_subgraph_pass"}}; + "memory_optimize_pass"}}; + if (passes.size() == 1) { + passes_local.push_back(passes[0]); + } RunPasses(passes_local); } else { RunPasses(passes); diff --git a/lite/core/profile/precision_profiler.h b/lite/core/profile/precision_profiler.h index d9111e5c46c9217b181e5a3e5a8c7981f46250df..39213a33cebd05d9cfa50d82cdfb09ad3f7ad637 100644 --- a/lite/core/profile/precision_profiler.h +++ b/lite/core/profile/precision_profiler.h @@ -22,18 +22,25 @@ #include #include "lite/core/program.h" +#ifdef LITE_WITH_OPENCL +#include "lite/backends/opencl/cl_image_converter.h" +#include "lite/backends/opencl/cl_include.h" +#include "lite/kernels/opencl/image_helper.h" +#endif + namespace paddle { namespace lite { namespace profile { template -static void write_tensorfile(const Tensor* tensor, const std::string& locate) { +static bool write_tensorfile(const Tensor* tensor, const std::string& locate) { if (locate.find('/') != std::string::npos) { - return; + return false; } FILE* fp = fopen(locate.c_str(), "w"); if (fp == nullptr) { 
LOG(ERROR) << "file open field " << locate; + return false; } else { const dtype* data = tensor->data(); for (int i = 0; i < tensor->numel(); ++i) { @@ -41,63 +48,227 @@ static void write_tensorfile(const Tensor* tensor, const std::string& locate) { } } fclose(fp); + return true; } class PrecisionProfiler { public: - explicit PrecisionProfiler(const Instruction* inst) : inst_(inst) {} - ~PrecisionProfiler() { - LOG(INFO) << ">> Running kernel: " << inst_->op()->op_info()->Repr() - << " on Target " << TargetToStr(inst_->kernel()->target()) << " " - << PrecisionToStr(inst_->kernel()->precision()); - auto tensor_mean = [](const Tensor* in, - PrecisionType ptype, - std::string name = "inst") -> double { - if (!in->data()) { - return -99999; - } - double sum = 0.; - switch (ptype) { + // TODO(ysh329): need to remove `explicit PrecisionProfiler` + // keep this method only for arm/math/conditional + explicit PrecisionProfiler(const Instruction* inst) { + std::string inst_precison_str = GetInstPrecision(inst); + } + + PrecisionProfiler() {} + + std::string GetSummaryHeader() { + using std::setw; + using std::left; + using std::fixed; + STL::stringstream ss; + ss << "========================================= " + << "Detailed Precision Profiler Summary " + << "=========================================" << std::endl; + ss << setw(45) << left << "operator:(kernel_info)" + << " " << setw(70) << left << "output_tensor_name:(tensor_info)" + << " " << setw(15) << left << "dims" + << " " << setw(15) << left << "mean" + << " " << setw(15) << left << "std_deviation" + << " " << setw(15) << left << "ave_grow_rate*" << std::endl; + + return ss.str(); + } + + template + double compute_mean(const T* in, const size_t length) { + double sum = 0.; + for (size_t i = 0; i < length; ++i) { + sum += in[i]; + } + return sum / length; + } + + template + double compute_standard_deviation(const T* in, + const size_t length, + bool has_mean = false, + double mean = 10000) { + if (!has_mean) { + mean 
= compute_mean(in, length); + } + + double variance = 0.; + for (size_t i = 0; i < length; ++i) { + variance += pow((in[i] - mean), 2); + } + variance /= length; + return sqrt(variance); + } + + template + double compute_average_grow_rate(const T* in, const size_t length) { + const double eps = 1e-5; + double ave_grow_rate = 0.0f; + for (size_t i = 1; i < length; ++i) { + ave_grow_rate += (in[i] - in[i - 1]) / (in[i - 1] + eps); + } + ave_grow_rate /= length; + return ave_grow_rate; + } + + // check if output tensor unused + bool is_unused(const Tensor* in) { + if (!in->data()) { + return true; + } + return false; + } + + void compute_tensor_precision_info(const Tensor* in, + TargetType target_type, + PrecisionType precision_type, + DataLayoutType layout_type, + double* mean, + double* std_dev, + double* ave_grow_rate, + std::string name = "inst", + bool write_result_to_file = false) { + std::string unsupported_error_log = + "Unsupported precision profile for kernel registered on" + + TargetToStr(target_type) + "/" + PrecisionToStr(precision_type) + "/" + + DataLayoutToStr(layout_type); + + if (target_type == TARGET(kARM) || target_type == TARGET(kHost) || + target_type == TARGET(kX86)) { + switch (precision_type) { case PRECISION(kFloat): { auto ptr = in->data(); - // write_tensorfile(in, name); - for (int i = 0; i < in->numel(); ++i) { - sum += ptr[i]; - } - return sum / in->numel(); + *mean = compute_mean(ptr, in->numel()); + *std_dev = + compute_standard_deviation(ptr, in->numel(), true, *mean); + *ave_grow_rate = compute_average_grow_rate(ptr, in->numel()); + write_result_to_file&& write_tensorfile(in, name); + return; } case PRECISION(kAny): { auto ptr = in->data(); - // write_tensorfile(in, name); - for (int i = 0; i < in->numel(); ++i) { - sum += ptr[i]; - } - return sum / in->numel(); + *mean = compute_mean(ptr, in->numel()); + *std_dev = + compute_standard_deviation(ptr, in->numel(), true, *mean); + *ave_grow_rate = compute_average_grow_rate(ptr, 
in->numel()); + write_result_to_file&& write_tensorfile(in, name); + return; } case PRECISION(kInt8): { auto ptr = in->data(); - // write_tensorfile(in, name); - for (int i = 0; i < in->numel(); ++i) { - sum += ptr[i]; - } - return sum / in->numel(); + *mean = compute_mean(ptr, in->numel()); + *std_dev = + compute_standard_deviation(ptr, in->numel(), true, *mean); + *ave_grow_rate = compute_average_grow_rate(ptr, in->numel()); + write_result_to_file&& write_tensorfile(in, name); + return; } case PRECISION(kInt32): { auto ptr = in->data(); - // write_tensorfile(in, name); - for (int i = 0; i < in->numel(); ++i) { - sum += ptr[i]; - } - return sum / in->numel(); + *mean = compute_mean(ptr, in->numel()); + *std_dev = compute_standard_deviation( + ptr, in->numel(), true, *mean); + *ave_grow_rate = compute_average_grow_rate(ptr, in->numel()); + write_result_to_file&& write_tensorfile(in, name); + return; } default: - LOG(INFO) << "unsupport data type: " << PrecisionToStr(ptype); - return 0.; + *mean = -333333333333; + *std_dev = -33333333333; + *ave_grow_rate = -33333333333; + LOG(ERROR) << unsupported_error_log; + return; } - }; - if (inst_->op()->op_info()->Type() != "fetch") { - auto op = const_cast(inst_->op()); - auto kernel = inst_->kernel(); +#ifdef LITE_WITH_OPENCL + } else if (target_type == TARGET(kOpenCL)) { + switch (layout_type) { + case DATALAYOUT(kImageDefault): { + paddle::lite::CLImageConverterDefault default_convertor; + auto image_shape = default_convertor.InitImageDimInfoWith(in->dims()); + size_t im_w = image_shape[0]; + size_t im_h = image_shape[1]; + VLOG(1) << "image shape(W,H) of " << name << ": " << im_w << " " + << im_h; + std::vector in_data_v(im_w * im_h * 4); + std::vector real_out_v(in->numel()); + const size_t cl_image2d_row_pitch{0}; + const size_t cl_image2d_slice_pitch{0}; + TargetWrapperCL::ImgcpySync(in_data_v.data(), + in->data(), + im_w, + im_h, + cl_image2d_row_pitch, + cl_image2d_slice_pitch, + IoDirection::DtoH); + 
default_convertor.ImageToNCHW( + in_data_v.data(), real_out_v.data(), image_shape, in->dims()); + CHECK(real_out_v.size() == in->numel()); + *mean = compute_mean(real_out_v.data(), real_out_v.size()); + *std_dev = compute_standard_deviation( + real_out_v.data(), in->numel(), true, *mean); + *ave_grow_rate = compute_average_grow_rate(real_out_v.data(), + real_out_v.size()); + write_result_to_file&& write_tensorfile(in, name); + return; + } + case DATALAYOUT(kNCHW): { + std::vector in_data_v(in->numel(), 0); + TargetWrapperCL::MemcpySync(in_data_v.data(), + in->data(), + in->numel() * sizeof(float), + IoDirection::DtoH); + VLOG(1) << name << ":" << in->numel(); + *mean = compute_mean(in_data_v.data(), in->numel()); + *std_dev = compute_standard_deviation( + in_data_v.data(), in->numel(), true, *mean); + *ave_grow_rate = + compute_average_grow_rate(in_data_v.data(), in->numel()); + write_result_to_file&& write_tensorfile(in, name); + return; + } + default: + *mean = -222222222222; + *std_dev = -22222222222; + *ave_grow_rate = -22222222222; + LOG(ERROR) << unsupported_error_log; + return; + } +#endif + } else { + *mean = -111111111111; + *std_dev = -11111111111; + *ave_grow_rate = -11111111111; + LOG(ERROR) << unsupported_error_log; + return; + } + } + + std::string GetInstPrecision(const Instruction* inst = nullptr) { + using std::setw; + using std::left; + using std::fixed; + STL::stringstream ss; + bool write_result_to_file = false; + + VLOG(1) << ">> Running kernel: " << inst->op()->op_info()->Repr() + << " registered on " << TargetToStr(inst->kernel()->target()) << "/" + << PrecisionToStr(inst->kernel()->precision()) << "/" + << DataLayoutToStr(inst->kernel()->layout()); + + std::string kernel_repr = inst->op()->op_info()->Repr(); + std::string kernel_place = TargetToStr(inst->kernel()->target()) + "/" + + PrecisionToStr(inst->kernel()->precision()) + + "/" + DataLayoutToStr(inst->kernel()->layout()); + std::string op_name = inst->op()->op_info()->Type(); + + if 
(inst->op()->op_info()->Type() != "fetch") { + auto op = const_cast(inst->op()); + auto kernel = inst->kernel(); auto op_scope = op->scope(); auto out_names = op->op_info()->output_names(); for (auto& out_name : out_names) { @@ -106,32 +277,90 @@ class PrecisionProfiler { auto type = kernel->GetOutputDeclType(out_arg_name); if (type->IsTensor()) { - auto tout = op_scope->FindVar(out_name)->GetMutable(); - double mean = tensor_mean(tout, type->precision(), out_name); - LOG(INFO) << "output name: " << out_name << ", dims: " << tout->dims() - << ", precision: " << PrecisionToStr(type->precision()) - << ", mean value: " << mean << " shape:" << tout->dims(); + const Tensor* tout = + op_scope->FindVar(out_name)->GetMutable(); + double mean = -999999; + double std_dev = -100000; + double ave_grow_rate = 99999; + std::string mean_str{"unused"}; + std::string std_dev_str{"unused"}; + std::string ave_grow_rate_str{"unused"}; + + if (!is_unused(tout)) { + compute_tensor_precision_info(tout, + type->target(), + type->precision(), + type->layout(), + &mean, + &std_dev, + &ave_grow_rate, + out_name, + write_result_to_file); + mean_str = std::to_string(mean); + std_dev_str = std::to_string(std_dev); + ave_grow_rate_str = std::to_string(ave_grow_rate); + } + std::string kernel_info = op_name + ":" + kernel_place; + std::string output_arg_info = out_name + ":" + + TargetToStr(type->target()) + "/" + + PrecisionToStr(type->precision()) + + "/" + DataLayoutToStr(type->layout()); + + ss << setw(45) << left << kernel_info << " " << setw(70) << left + << output_arg_info << " " << setw(15) << left << tout->dims() + << " " << setw(15) << left << mean_str << " " << setw(15) << left + << std_dev_str << " " << setw(15) << left << ave_grow_rate_str + << std::endl; } else if (type->IsTensorList()) { - auto tout = + auto touts = op_scope->FindVar(out_name)->GetMutable>(); - for (auto& t : *tout) { - double mean = tensor_mean(&t, type->precision(), out_name); - LOG(INFO) << "output name: " << 
out_name << ", dims: " << t.dims() - << ", precision: " << PrecisionToStr(type->precision()) - << ", mean value: " << mean; + for (auto t : *touts) { + const Tensor* tout = &t; + double mean = -999999; + double std_dev = -100000; + double ave_grow_rate = 99999; + std::string mean_str{"unused"}; + std::string std_dev_str{"unused"}; + std::string ave_grow_rate_str{"unused"}; + + if (!is_unused(tout)) { + compute_tensor_precision_info(tout, + type->target(), + type->precision(), + type->layout(), + &mean, + &std_dev, + &ave_grow_rate, + out_name, + write_result_to_file); + mean_str = std::to_string(mean); + std_dev_str = std::to_string(std_dev); + ave_grow_rate_str = std::to_string(ave_grow_rate); + } + std::string kernel_info = op_name + ":" + kernel_place; + std::string output_arg_info = out_name + ":" + + TargetToStr(type->target()) + "/" + + PrecisionToStr(type->precision()) + + "/" + DataLayoutToStr(type->layout()); + + ss << setw(45) << left << kernel_info << " " << setw(70) << left + << output_arg_info << " " << setw(15) << left << tout->dims() + << " " << setw(15) << left << mean_str << " " << setw(15) << left + << std_dev_str << " " << setw(15) << left << ave_grow_rate_str + << std::endl; } } } } + return ss.str(); } - - private: - const Instruction* inst_{nullptr}; }; } // namespace profile } // namespace lite } // namespace paddle +// TODO(ysh329): need to remove. 
+// keep this method only for arm/math/conditional_block_compute #define LITE_PRECISION_PROFILE(inst) \ { auto a = paddle::lite::profile::PrecisionProfiler(&inst); } diff --git a/lite/core/program.cc b/lite/core/program.cc index 0895643a6adde0095f9d2892c41f263eedd4284f..7284c3983cb34a0db2387ece40f6d07b9d9a8511 100644 --- a/lite/core/program.cc +++ b/lite/core/program.cc @@ -136,6 +136,14 @@ void RuntimeProgram::UpdateVarsOfProgram(cpp::ProgramDesc* desc) { } void RuntimeProgram::Run() { +#ifdef LITE_WITH_PROFILE +#ifdef LITE_WITH_PRECISION_PROFILE + auto inst_precision_profiler = paddle::lite::profile::PrecisionProfiler(); + std::string precision_profiler_summary = + inst_precision_profiler.GetSummaryHeader(); +#endif +#endif + for (auto& inst : instructions_) { #ifndef LITE_WITH_FPGA if (inst.is_feed_fetch_op()) continue; @@ -144,13 +152,17 @@ void RuntimeProgram::Run() { #ifdef LITE_WITH_PROFILE #ifdef LITE_WITH_PRECISION_PROFILE #ifndef LITE_WITH_FPGA - LITE_PRECISION_PROFILE(inst) + precision_profiler_summary += + inst_precision_profiler.GetInstPrecision(&inst); #endif #endif // LITE_WITH_PRECISION_PROFILE #endif // LITE_WITH_PROFILE } #ifdef LITE_WITH_PROFILE LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kDispatch, false, 0); +#ifdef LITE_WITH_PRECISION_PROFILE + LOG(INFO) << "\n" << precision_profiler_summary; +#endif // LITE_WITH_PRECISION_PROFILE #endif // LITE_WITH_PROFILE } diff --git a/lite/core/program_fake_utils.h b/lite/core/program_fake_utils.h index edcbb101aa5ddb090cc585a16597967cb5114936..fbee253872237bce08f3f67b948da79becbae21a 100644 --- a/lite/core/program_fake_utils.h +++ b/lite/core/program_fake_utils.h @@ -30,9 +30,9 @@ Program FakeProgram() { auto add_fc = [&](int id, std::string x) { // create variables - std::string w1 = "w" + std::to_string(id); - std::string b1 = "b" + std::to_string(id); - std::string out1 = "out" + std::to_string(id); + std::string w1 = "w" + paddle::lite::to_string(id); + std::string b1 = "b" + 
paddle::lite::to_string(id); + std::string out1 = "out" + paddle::lite::to_string(id); auto w1v = program.scope()->Var(w1)->GetMutable(); auto b1v = program.scope()->Var(b1)->GetMutable(); auto out1v = program.scope()->Var(out1)->GetMutable(); diff --git a/lite/core/tensor.cc b/lite/core/tensor.cc index 38a6be6767eae62f9d91c9c11811bc49639331bf..ecb9935dfd13c09cbd1a20f3833e6ab76161192a 100644 --- a/lite/core/tensor.cc +++ b/lite/core/tensor.cc @@ -75,6 +75,7 @@ void TensorLite::ShareDataWith(const TensorLite &other) { target_ = other.target_; lod_ = other.lod_; memory_size_ = other.memory_size_; + precision_ = other.precision_; } void TensorLite::CopyDataFrom(const TensorLite &other) { @@ -82,6 +83,7 @@ void TensorLite::CopyDataFrom(const TensorLite &other) { target_ = other.target_; lod_ = other.lod_; memory_size_ = other.memory_size_; + precision_ = other.precision_; buffer_->CopyDataFrom(*other.buffer_, memory_size_); } @@ -96,6 +98,21 @@ void *TensorLite::mutable_data(TargetType target, size_t memory_size) { return mutable_data(memory_size); } +void TensorLite::ResetBuffer(std::shared_ptr buffer, + size_t memory_size) { + CHECK_EQ(offset_, 0) + << "Only the offset is supported to zero when the Buffer is reset."; + if (buffer_) { + CHECK_LE(memory_size_, buffer->space()) + << "The space of buffer is not enough to store the tensor."; + CHECK_LE(memory_size, buffer->space()) + << "The buffer is smaller than the specified minimum size."; + } + buffer_ = buffer; + memory_size_ = memory_size; + target_ = buffer->target(); +} + #ifdef LITE_WITH_OPENCL template <> const cl::Image2D *TensorLite::data() const { @@ -103,8 +120,8 @@ const cl::Image2D *TensorLite::data() const { return static_cast(buffer_->data()); } -template <> // use int16_t represent half float -const cl::Image2D *TensorLite::data() const { +template <> // use uint16_t represent half float +const cl::Image2D *TensorLite::data() const { if (nullptr == buffer_->data()) return nullptr; return 
static_cast(buffer_->data()); } diff --git a/lite/core/tensor.h b/lite/core/tensor.h index 04e540002b553a0e0f7db0144fd970bdb6a4d9ed..2209e524f413b4cedf255566bfc1b6b1f1229f8d 100644 --- a/lite/core/tensor.h +++ b/lite/core/tensor.h @@ -102,9 +102,10 @@ using LoD = std::vector>; class TensorLite { public: TensorLite() : buffer_(std::make_shared()) {} + explicit TensorLite(std::shared_ptr buffer) : buffer_(buffer) {} template - void Assign(DType *data, const DimT &dim) { + void Assign(const DType *data, const DimT &dim) { Resize(dim); auto *dst = mutable_data(Target); CopySync( @@ -178,6 +179,11 @@ class TensorLite { (static_cast(buffer_->data()) + offset_)); } + void *raw_data() { + return static_cast( + (static_cast(buffer_->data()) + offset_)); + } + void clear() { buffer_->Free(); offset_ = 0; @@ -195,6 +201,8 @@ class TensorLite { void CopyDataFrom(const TensorLite &other); + void ResetBuffer(std::shared_ptr buffer, size_t memory_size); + TargetType target() const { return target_; } template @@ -260,8 +268,8 @@ bool TensorCompareWith(const TensorT &a, const TensorT &b) { template <> const cl::Image2D *TensorLite::data() const; -template <> // use int16_t represent half float -const cl::Image2D *TensorLite::data() const; +template <> // use uint16_t represent half float +const cl::Image2D *TensorLite::data() const; #endif } // namespace lite diff --git a/lite/core/type_system.h b/lite/core/type_system.h index aeddf965c3b999750c7cca3595cc9f669b32d50e..2cf8366a2a1cbb6eb0c5f4e3dff3e4ac2623ff66 100644 --- a/lite/core/type_system.h +++ b/lite/core/type_system.h @@ -177,8 +177,9 @@ static bool TargetCompatibleTo(const Type& a, const Type& b) { return x == TARGET(kHost) || x == TARGET(kX86) || x == TARGET(kARM); }; if (a.IsVoid() || b.IsVoid()) return true; - if (a.IsTensor() || b.IsTensor()) { - if (a.IsTensor() && b.IsTensor()) { + if (a.IsTensor() || b.IsTensor() || a.IsTensorList() || b.IsTensorList()) { + if ((a.IsTensor() && b.IsTensor()) || + (a.IsTensorList() && 
b.IsTensorList())) { return is_host(a.target()) ? is_host(b.target()) : a.target() == b.target(); } diff --git a/lite/core/version.h.in b/lite/core/version.h.in index d34c32073b852a50b5d26984ed4812ac4f38a870..da2d5f3ed99631973d97a94741e1711391237261 100644 --- a/lite/core/version.h.in +++ b/lite/core/version.h.in @@ -53,9 +53,9 @@ static std::string version() { static int64_t int_version(const std::string& version) { const std::vector vec = Split(version, "."); if (vec.size() == 3) { - return std::stoi(vec[0]) * MAJOR_COEFF + - std::stoi(vec[1]) * MINOR_COEFF + - std::stoi(vec[2]) * PATCH_COEFF; + return atoi(vec[0].c_str()) * MAJOR_COEFF + + atoi(vec[1].c_str()) * MINOR_COEFF + + atoi(vec[2].c_str()) * PATCH_COEFF; } return -1; } diff --git a/lite/demo/cxx/README.md b/lite/demo/cxx/README.md index 447bcbaff018d15a1bc3075c1153f724672f40a8..c2bdb25f4e3b46265bcc4830b613b6d0d6d8232d 100644 --- a/lite/demo/cxx/README.md +++ b/lite/demo/cxx/README.md @@ -8,12 +8,42 @@ 2. 人脸识别和佩戴口罩判断的Demo +目前,PaddleLite提供了shell端的人脸识别和佩戴口罩判断的Demo,首先基于已经准备好的Demo进行演示,然后介绍如何基于代码编译Demo并执行。 + +**下载Demo并执行** + +下载压缩包[mask_detection_files](https://paddle-inference-dist.cdn.bcebos.com/PaddleLiteDemo/mask_detection_files.tgz),解压到本地,其中包括编译好的可执行文件、模型文件、测试图片、PaddleLite 2.3版本动态库。 + +电脑连接安卓手机,在电脑shell端执行如下命令,将mask_detection_files文件夹push到安卓手机上。 +``` +adb push mask_detection_files /data/local/tmp/ +``` + +在电脑shell端执行如下命令,进入安卓手机,执行demo。 +``` +adb shell +cd /data/local/tmp/mask_detection_files +export LD_LIBRARY_PATH=/data/local/tmp/mask_detection_files:$LD_LIBRARY_PATH +./mask_detection face_detection mask_classification test.jpg +``` + +回到电脑端,将结果图片(test_mask_detection_result.jpg)取出,查看检测结果。 +``` +exit +adb pull /data/local/tmp/mask_detection_files/test_mask_detection_result.jpg ./ +``` + + +**编译Demo并执行** + 参考[源码编译](https://paddlepaddle.github.io/Paddle-Lite/v2.2.0/source_compile/)准备编译环境。 -执行下面命令,下载PaddleLite代码。 +执行下面命令,下载PaddleLite代码,切换到2.3版本分支。 ```shell git clone 
https://github.com/PaddlePaddle/Paddle-Lite.git cd Paddle-Lite +git fetch origin release/v2.3:release/v2.3 +git checkout release/v2.3 ``` 进入PaddleLite根目录,编译预测库。 @@ -25,7 +55,7 @@ cd Paddle-Lite --android_stl=c++_static \ --build_extra=ON \ --shutdown_log=OFF \ - tiny_publish + full_publish ``` 进入编译目录,下载模型和图片的压缩包,编译可执行文件。 @@ -70,7 +100,11 @@ export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH adb pull /data/local/tmp/test_mask_detection_result.jpg ./ ``` -![test_mask_detection_result](https://user-images.githubusercontent.com/7383104/74279176-6200cd00-4d55-11ea-9fc0-83cfc2b3b37d.jpg) +![test_mask_detection_result](https://user-images.githubusercontent.com/7383104/75131866-bae64300-570f-11ea-9cad-17acfaea1cfc.jpg) + +注:mask_detetion.cc 中的缩放因子shrink, 检测阈值detect_threshold, 可供自由配置: + - 缩放因子越大,模型运行速度越慢,检测准确率越高。 + - 检测阈值越高,人脸筛选越严格,检测出的人脸框可能越少。 3. 编译并运行全量api的demo(注:当编译模式为tiny_pubish时将不存在该demo) ```shell diff --git a/lite/demo/cxx/cuda_demo/CMakeLists.txt b/lite/demo/cxx/cuda_demo/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..e27548b4e56ce03098c5c82b3eee49add62cc0a4 --- /dev/null +++ b/lite/demo/cxx/cuda_demo/CMakeLists.txt @@ -0,0 +1,20 @@ +project(demo CXX C) +cmake_minimum_required(VERSION 2.8) + +set(TARGET demo) +set(CMAKE_CXX_FLAGS "-std=c++11 -O3") + +set(LITE_LIB "${PROJECT_SOURCE_DIR}/../../cxx") +set(PROTOBUF_LIB "${PROJECT_SOURCE_DIR}/../../third_party/protobuf") + +include_directories("${LITE_LIB}/include") +link_directories("${LITE_LIB}/lib") +link_directories("${PROTOBUF_LIB}/lib") + +add_executable(${TARGET} ${TARGET}.cc) + +set(DEPS ${LITE_LIB}/lib/libpaddle_full_api_shared.so) +set(DEPS ${DEPS} protobuf-lite) +set(DEPS ${DEPS} "-lrt -lpthread -ldl") + +target_link_libraries(${TARGET} ${DEPS}) diff --git a/lite/demo/cxx/cuda_demo/demo.cc b/lite/demo/cxx/cuda_demo/demo.cc new file mode 100644 index 0000000000000000000000000000000000000000..593e73cf83cd491fd8e33e415d17106dc8f4ce14 --- /dev/null +++ 
b/lite/demo/cxx/cuda_demo/demo.cc @@ -0,0 +1,80 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include "paddle_api.h" // NOLINT + +using namespace paddle::lite_api; // NOLINT + +int64_t ShapeProduction(const shape_t& shape) { + int64_t res = 1; + for (auto i : shape) res *= i; + return res; +} + +void RunModel(std::string model_dir) { + // 1. Create CxxConfig + CxxConfig config; + config.set_model_file(model_dir + "/__model__"); + config.set_param_file(model_dir + "/__params__"); + config.set_valid_places({ + Place{TARGET(kCUDA), PRECISION(kFloat)}, + }); + // 2. Create PaddlePredictor by CxxConfig + std::shared_ptr predictor = + CreatePaddlePredictor(config); + + // 3. Prepare input data + int num = 1; + int channels = 3; + int height = 608; + int width = 608; + std::unique_ptr input_tensor(std::move(predictor->GetInput(0))); + input_tensor->Resize({num, channels, height, width}); + // fake input data + std::vector data(num * channels * height * width, 0); + for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) { + data[i] = i % 10 * 0.1; + } + input_tensor->CopyFromCpu(data.data()); + std::unique_ptr size_tensor(std::move(predictor->GetInput(1))); + size_tensor->Resize({1, 2}); + std::vector size_data{608, 608}; + size_tensor->CopyFromCpu(size_data.data()); + + // 4. Run predictor + predictor->Run(); + + // 5. 
Get output + std::unique_ptr output_tensor( + std::move(predictor->GetOutput(0))); + std::vector out_cpu(ShapeProduction(output_tensor->shape()), 0); + std::cout << "output size is " << ShapeProduction(output_tensor->shape()) + << std::endl; + output_tensor->CopyToCpu(out_cpu.data()); + for (int i = 0; i < ShapeProduction(output_tensor->shape()); i += 100) { + std::cout << "Output[" << i << "]: " << out_cpu[i] << std::endl; + } +} + +int main(int argc, char** argv) { + if (argc < 2) { + std::cerr << "[ERROR] usage: ./" << argv[0] << " model_dir\n"; + exit(1); + } + std::string model_dir = argv[1]; + RunModel(model_dir); + return 0; +} diff --git a/lite/demo/cxx/makefiles/test_cv/Makefile.android.armv7 b/lite/demo/cxx/makefiles/test_cv/Makefile.android.armv7 index d659a316cd856fd550e83b125573409f239b8cf2..4a63563c4ff12b825e881327ec77adc5b2f03aeb 100644 --- a/lite/demo/cxx/makefiles/test_cv/Makefile.android.armv7 +++ b/lite/demo/cxx/makefiles/test_cv/Makefile.android.armv7 @@ -28,7 +28,7 @@ OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/armeabi-v7a/include CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include -CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_full_api_shared $(SYSTEM_LIBS) +#CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SYSTEM_LIBS) ############################################################### # How to use one of static libaray: # @@ -40,7 +40,7 @@ CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_full_api_shared $(SYS # 1. Comment above line using `libpaddle_light_api_shared.so` # 2. 
Undo comment below line using `libpaddle_api_light_bundled.a` -#CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS) +CXX_LIBS = ${OPENCV_LIBS} $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS) test_model_cv: fetch_opencv test_model_cv.o $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) test_model_cv.o -o test_model_cv $(CXX_LIBS) $(LDFLAGS) diff --git a/lite/demo/cxx/makefiles/test_cv/Makefile.android.armv8 b/lite/demo/cxx/makefiles/test_cv/Makefile.android.armv8 index c80b07d5c029a3624a514e07375fd08e8770da25..70d6bed52b84be7d050ef15ab483e8d06342c82d 100644 --- a/lite/demo/cxx/makefiles/test_cv/Makefile.android.armv8 +++ b/lite/demo/cxx/makefiles/test_cv/Makefile.android.armv8 @@ -28,7 +28,7 @@ OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/arm64-v8a/include CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include -CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_full_api_shared $(SYSTEM_LIBS) +#CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SYSTEM_LIBS) ############################################################### # How to use one of static libaray: # # `libpaddle_api_full_bundled.a` # @@ -39,7 +39,7 @@ CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_full_api_shared $(SYS # 1. Comment above line using `libpaddle_light_api_shared.so` # 2. 
Undo comment below line using `libpaddle_api_light_bundled.a` -#CXX_LIBS = ${OPENCV_LIBS} $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS) +CXX_LIBS = ${OPENCV_LIBS} $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS) test_model_cv: fetch_opencv test_model_cv.o $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) test_model_cv.o -o test_model_cv $(CXX_LIBS) $(LDFLAGS) diff --git a/lite/demo/cxx/mask_detection/mask_detection.cc b/lite/demo/cxx/mask_detection/mask_detection.cc index 748b84365fc70aa59171a6bf8847f554308fdc8c..09a9c0ee158e7d5913a78877711d831fc5738cf1 100644 --- a/lite/demo/cxx/mask_detection/mask_detection.cc +++ b/lite/demo/cxx/mask_detection/mask_detection.cc @@ -81,6 +81,29 @@ void neon_mean_scale(const float* din, } } +cv::Mat crop_img(const cv::Mat& img, + cv::Rect rec, + int res_width, + int res_height) { + float xmin = rec.x; + float ymin = rec.y; + float w = rec.width; + float h = rec.height; + float center_x = xmin + w / 2; + float center_y = ymin + h / 2; + cv::Point2f center(center_x, center_y); + float max_wh = std::max(w / 2, h / 2); + float scale = res_width / (2 * max_wh * 1.5); + cv::Mat rot_mat = cv::getRotationMatrix2D(center, 0.f, scale); + rot_mat.at(0, 2) = + rot_mat.at(0, 2) - (center_x - res_width / 2.0); + rot_mat.at(1, 2) = + rot_mat.at(1, 2) - (center_y - res_width / 2.0); + cv::Mat affine_img; + cv::warpAffine(img, affine_img, rot_mat, cv::Size(res_width, res_height)); + return affine_img; +} + void pre_process(const cv::Mat& img, int width, int height, @@ -89,8 +112,12 @@ void pre_process(const cv::Mat& img, float* data, bool is_scale = false) { cv::Mat resized_img; - cv::resize( - img, resized_img, cv::Size(width, height), 0.f, 0.f, cv::INTER_CUBIC); + if (img.cols != width || img.rows != height) { + cv::resize( + img, resized_img, cv::Size(width, height), 0.f, 0.f, cv::INTER_CUBIC); + } else { + resized_img = img; + } cv::Mat imgf; float scale_factor = is_scale ? 
1.f / 256 : 1.f; resized_img.convertTo(imgf, CV_32FC3, scale_factor); @@ -98,12 +125,12 @@ void pre_process(const cv::Mat& img, neon_mean_scale(dimg, data, width * height, mean, scale); } -void RunModel(std::string det_model_dir, - std::string class_model_dir, +void RunModel(std::string det_model_file, + std::string class_model_file, std::string img_path) { // Prepare cv::Mat img = imread(img_path, cv::IMREAD_COLOR); - float shrink = 0.2; + float shrink = 0.4; int width = img.cols; int height = img.rows; int s_width = static_cast(width * shrink); @@ -111,11 +138,12 @@ void RunModel(std::string det_model_dir, // Detection MobileConfig config; - config.set_model_dir(det_model_dir); + config.set_model_from_file(det_model_file); // Create Predictor For Detction Model std::shared_ptr predictor = CreatePaddlePredictor(config); + std::cout << "Load detecion model succeed." << std::endl; // Get Input Tensor std::unique_ptr input_tensor0(std::move(predictor->GetInput(0))); @@ -136,9 +164,10 @@ void RunModel(std::string det_model_dir, auto* outptr = output_tensor0->data(); auto shape_out = output_tensor0->shape(); int64_t out_len = ShapeProduction(shape_out); + std::cout << "Detecting face succeed." << std::endl; // Filter Out Detection Box - float detect_threshold = 0.3; + float detect_threshold = 0.7; std::vector detect_result; for (int i = 0; i < out_len / 6; ++i) { if (outptr[1] >= detect_threshold) { @@ -158,10 +187,11 @@ void RunModel(std::string det_model_dir, } // Classification - config.set_model_dir(class_model_dir); + config.set_model_from_file(class_model_file); // Create Predictor For Classification Model predictor = CreatePaddlePredictor(config); + std::cout << "Load classification model succeed." 
<< std::endl; // Get Input Tensor std::unique_ptr input_tensor1(std::move(predictor->GetInput(0))); @@ -172,10 +202,14 @@ void RunModel(std::string det_model_dir, int detect_num = detect_result.size(); std::vector classify_mean = {0.5f, 0.5f, 0.5f}; std::vector classify_scale = {1.f, 1.f, 1.f}; - float classify_threshold = 0.5; for (int i = 0; i < detect_num; ++i) { cv::Rect rec_clip = detect_result[i].rec; - cv::Mat roi = img(rec_clip); + cv::Mat roi = crop_img(img, rec_clip, classify_w, classify_h); + + // uncomment two lines below, save roi img to disk + // std::string roi_name = "roi_" + paddle::lite::to_string(i) + // + ".jpg"; + // imwrite(roi_name, roi); // Do PreProcess pre_process(roi, @@ -191,37 +225,60 @@ void RunModel(std::string det_model_dir, // Get Output Tensor std::unique_ptr output_tensor1( - std::move(predictor->GetOutput(1))); + std::move(predictor->GetOutput(0))); auto* outptr = output_tensor1->data(); + float prob = outptr[1]; // Draw Detection and Classification Results - cv::rectangle(img, rec_clip, cv::Scalar(0, 0, 255), 2, cv::LINE_AA); - std::string text = outptr[1] > classify_threshold ? 
"wear mask" : "no mask"; - int font_face = cv::FONT_HERSHEY_COMPLEX_SMALL; - double font_scale = 1.f; - int thickness = 1; + bool flag_mask = prob > 0.5f; + cv::Scalar roi_color; + std::string text; + if (flag_mask) { + text = "MASK: "; + roi_color = cv::Scalar(0, 255, 0); + } else { + text = "NO MASK: "; + roi_color = cv::Scalar(0, 0, 255); + prob = 1 - prob; + } + std::string prob_str = std::to_string(prob * 100); + int point_idx = prob_str.find_last_of("."); + + text += prob_str.substr(0, point_idx + 3) + "%"; + int font_face = cv::FONT_HERSHEY_SIMPLEX; + double font_scale = 0.25; + float thickness = 1; cv::Size text_size = cv::getTextSize(text, font_face, font_scale, thickness, nullptr); - float new_font_scale = rec_clip.width * 0.7 * font_scale / text_size.width; - text_size = - cv::getTextSize(text, font_face, new_font_scale, thickness, nullptr); + + int top_space = std::max(0.35 * text_size.height, 2.0); + int bottom_space = top_space + 2; + int right_space = 0.05 * text_size.width; + int back_width = text_size.width + right_space; + int back_height = text_size.height + top_space + bottom_space; + + // Configure text background + cv::Rect text_back = + cv::Rect(rec_clip.x, rec_clip.y - back_height, back_width, back_height); + + // Draw roi object, text, and background + cv::rectangle(img, rec_clip, roi_color, 1); + cv::rectangle(img, text_back, cv::Scalar(225, 225, 225), -1); cv::Point origin; - origin.x = rec_clip.x + 5; - origin.y = rec_clip.y + text_size.height + 5; + origin.x = rec_clip.x; + origin.y = rec_clip.y - bottom_space; cv::putText(img, text, origin, font_face, - new_font_scale, - cv::Scalar(0, 255, 255), - thickness, - cv::LINE_AA); + font_scale, + cv::Scalar(0, 0, 0), + thickness); std::cout << "detect face, location: x=" << rec_clip.x << ", y=" << rec_clip.y << ", width=" << rec_clip.width - << ", height=" << rec_clip.height - << ", wear mask: " << (outptr[1] > classify_threshold) - << std::endl; + << ", height=" << rec_clip.height << ", wear 
mask: " << flag_mask + << ", prob: " << prob << std::endl; } // Write Result to Image File @@ -230,17 +287,19 @@ void RunModel(std::string det_model_dir, std::string img_name = img_path.substr(start + 1, end - start - 1); std::string result_name = img_name + "_mask_detection_result.jpg"; cv::imwrite(result_name, img); + std::cout << "write result to file: " << result_name << ", success." + << std::endl; } int main(int argc, char** argv) { if (argc < 3) { std::cerr << "[ERROR] usage: " << argv[0] - << " detction_model_dir classification_model_dir image_path\n"; + << " detction_model_file classification_model_file image_path\n"; exit(1); } - std::string detect_model_dir = argv[1]; - std::string classify_model_dir = argv[2]; + std::string detect_model_file = argv[1]; + std::string classify_model_file = argv[2]; std::string img_path = argv[3]; - RunModel(detect_model_dir, classify_model_dir, img_path); + RunModel(detect_model_file, classify_model_file, img_path); return 0; } diff --git a/lite/demo/cxx/mobile_classify/mobile_classify.cc b/lite/demo/cxx/mobile_classify/mobile_classify.cc index d0cf59e185e1330b7d8487d562afa0af29236007..518040ebd07bb4e8940f6a885cddd4f3c98143f3 100644 --- a/lite/demo/cxx/mobile_classify/mobile_classify.cc +++ b/lite/demo/cxx/mobile_classify/mobile_classify.cc @@ -126,7 +126,7 @@ void pre_process(const cv::Mat& img, neon_mean_scale(dimg, data, width * height, means, scales); } -void RunModel(std::string model_dir, +void RunModel(std::string model_file, std::string img_path, const std::vector& labels, const int topk, @@ -134,7 +134,7 @@ void RunModel(std::string model_dir, int height) { // 1. Set MobileConfig MobileConfig config; - config.set_model_dir(model_dir); + config.set_model_from_file(model_file); // 2. 
Create PaddlePredictor by MobileConfig std::shared_ptr predictor = @@ -169,12 +169,12 @@ void RunModel(std::string model_dir, int main(int argc, char** argv) { if (argc < 4) { std::cerr << "[ERROR] usage: " << argv[0] - << " model_dir image_path label_file\n"; + << " model_file image_path label_file\n"; exit(1); } - printf("parameter: model_dir, image_path and label_file are necessary \n"); + printf("parameter: model_file, image_path and label_file are necessary \n"); printf("parameter: topk, input_width, input_height, are optional \n"); - std::string model_dir = argv[1]; + std::string model_file = argv[1]; std::string img_path = argv[2]; std::string label_file = argv[3]; std::vector labels; @@ -190,6 +190,6 @@ int main(int argc, char** argv) { height = atoi(argv[6]); } - RunModel(model_dir, img_path, labels, topk, width, height); + RunModel(model_file, img_path, labels, topk, width, height); return 0; } diff --git a/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc b/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc index 9d923cb87da5244e4550be3fb6936a650ec9b53a..150bcd231c27c25d8510fc8dfa3281a8351514dd 100644 --- a/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc +++ b/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc @@ -12,8 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include +#include +#include #include +#include #include + #include "paddle_api.h" // NOLINT using namespace paddle::lite_api; // NOLINT @@ -24,13 +29,57 @@ int64_t ShapeProduction(const shape_t& shape) { return res; } -void RunModel(std::string model_dir) { +std::string ShapePrint(const shape_t& shape) { + std::string shape_str{""}; + for (auto i : shape) { + shape_str += std::to_string(i) + " "; + } + return shape_str; +} + +template +double compute_mean(const T* in, const size_t length) { + double sum = 0.; + for (size_t i = 0; i < length; ++i) { + sum += in[i]; + } + return sum / length; +} + +template +double compute_standard_deviation(const T* in, + const size_t length, + bool has_mean = false, + double mean = 10000) { + if (!has_mean) { + mean = compute_mean(in, length); + } + + double variance = 0.; + for (size_t i = 0; i < length; ++i) { + variance += pow((in[i] - mean), 2); + } + variance /= length; + return sqrt(variance); +} + +inline double GetCurrentUS() { + struct timeval time; + gettimeofday(&time, NULL); + return 1e+6 * time.tv_sec + time.tv_usec; +} + +void RunModel(std::string model_dir, + const shape_t& input_shape, + int repeats, + int warmup, + int print_output_elem) { // 1. Set MobileConfig MobileConfig config; - config.set_model_dir(model_dir); - // To load model transformed by opt after release/v2.3.0, plese use - // `set_model_from_file` listed below. - // config.set_model_from_file(model_dir); + config.set_model_from_file(model_dir); + // NOTE: To load model transformed by model_optimize_tool before + // release/v2.3.0, plese use `set_model_dir` API as listed below. + // config.set_model_dir(model_dir); // 2. Create PaddlePredictor by MobileConfig std::shared_ptr predictor = @@ -38,31 +87,108 @@ void RunModel(std::string model_dir) { // 3. 
Prepare input data std::unique_ptr input_tensor(std::move(predictor->GetInput(0))); - input_tensor->Resize({1, 3, 224, 224}); + input_tensor->Resize( + {input_shape[0], input_shape[1], input_shape[2], input_shape[3]}); auto* data = input_tensor->mutable_data(); for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) { data[i] = 1; } // 4. Run predictor - predictor->Run(); + for (size_t widx = 0; widx < warmup; ++widx) { + predictor->Run(); + } + + double sum_duration = 0.0; // millisecond; + double max_duration = 1e-5; + double min_duration = 1e5; + double avg_duration = -1; + for (size_t ridx = 0; ridx < repeats; ++ridx) { + auto start = GetCurrentUS(); + + predictor->Run(); + + auto duration = (GetCurrentUS() - start) / 1000.0; + sum_duration += duration; + max_duration = duration > max_duration ? duration : max_duration; + min_duration = duration < min_duration ? duration : min_duration; + std::cout << "run_idx:" << ridx + 1 << " / " << repeats << ": " << duration + << " ms" << std::endl; + } + avg_duration = sum_duration / static_cast(repeats); + std::cout << "\n======= benchmark summary =======\n" + << "input_shape(NCHW):" << ShapePrint(input_shape) << "\n" + << "model_dir:" << model_dir << "\n" + << "warmup:" << warmup << "\n" + << "repeats:" << repeats << "\n" + << "max_duration:" << max_duration << "\n" + << "min_duration:" << min_duration << "\n" + << "avg_duration:" << avg_duration << "\n"; // 5. 
Get output - std::unique_ptr output_tensor( - std::move(predictor->GetOutput(0))); - std::cout << "Output shape " << output_tensor->shape()[1] << std::endl; - for (int i = 0; i < ShapeProduction(output_tensor->shape()); i += 100) { - std::cout << "Output[" << i << "]: " << output_tensor->data()[i] + std::cout << "\n====== output summary ====== " << std::endl; + size_t output_tensor_num = predictor->GetOutputNames().size(); + std::cout << "output tensor num:" << output_tensor_num << std::endl; + + for (size_t tidx = 0; tidx < output_tensor_num; ++tidx) { + std::unique_ptr output_tensor = + predictor->GetOutput(tidx); + std::cout << "\n--- output tensor " << tidx << " ---" << std::endl; + auto out_shape = output_tensor->shape(); + auto out_data = output_tensor->data(); + auto out_mean = compute_mean(out_data, ShapeProduction(out_shape)); + auto out_std_dev = compute_standard_deviation( + out_data, ShapeProduction(out_shape), true, out_mean); + + std::cout << "output shape(NCHW):" << ShapePrint(out_shape) << std::endl; + std::cout << "output tensor " << tidx + << " elem num:" << ShapeProduction(out_shape) << std::endl; + std::cout << "output tensor " << tidx + << " standard deviation:" << out_std_dev << std::endl; + std::cout << "output tensor " << tidx << " mean value:" << out_mean << std::endl; + + // print output + if (print_output_elem) { + for (int i = 0; i < ShapeProduction(out_shape); ++i) { + std::cout << "out[" << tidx << "][" << i + << "]:" << output_tensor->data()[i] << std::endl; + } + } } } int main(int argc, char** argv) { - if (argc < 2) { - std::cerr << "[ERROR] usage: ./" << argv[0] << " naive_buffer_model_dir\n"; - exit(1); + shape_t input_shape{1, 3, 224, 224}; // shape_t ==> std::vector + int repeats = 10; + int warmup = 10; + int print_output_elem = 0; + + if (argc > 2 && argc < 9) { + std::cerr << "usage: ./" << argv[0] << "\n" + << " \n" + << " \n" + << " \n" + << " \n" + << " \n" + << " \n" + << " \n" + << " " << std::endl; + return 0; } + 
std::string model_dir = argv[1]; - RunModel(model_dir); + if (argc >= 9) { + input_shape[0] = atoi(argv[2]); + input_shape[1] = atoi(argv[3]); + input_shape[2] = atoi(argv[4]); + input_shape[3] = atoi(argv[5]); + repeats = atoi(argv[6]); + warmup = atoi(argv[7]); + print_output_elem = atoi(argv[8]); + } + + RunModel(model_dir, input_shape, repeats, warmup, print_output_elem); + return 0; } diff --git a/lite/demo/cxx/ssd_detection/ssd_detection.cc b/lite/demo/cxx/ssd_detection/ssd_detection.cc index 2408afcbf64a24924eca119a9d9481dc030250c9..0be4561cd8d083f26e562c2346da217bb4b48283 100644 --- a/lite/demo/cxx/ssd_detection/ssd_detection.cc +++ b/lite/demo/cxx/ssd_detection/ssd_detection.cc @@ -162,10 +162,10 @@ std::vector detect_object(const float* data, return rect_out; } -void RunModel(std::string model_dir, std::string img_path) { +void RunModel(std::string model_file, std::string img_path) { // 1. Set MobileConfig MobileConfig config; - config.set_model_dir(model_dir); + config.set_model_from_file(model_file); // 2. Create PaddlePredictor by MobileConfig std::shared_ptr predictor = @@ -199,11 +199,11 @@ void RunModel(std::string model_dir, std::string img_path) { int main(int argc, char** argv) { if (argc < 3) { - std::cerr << "[ERROR] usage: " << argv[0] << " model_dir image_path\n"; + std::cerr << "[ERROR] usage: " << argv[0] << " model_file image_path\n"; exit(1); } - std::string model_dir = argv[1]; + std::string model_file = argv[1]; std::string img_path = argv[2]; - RunModel(model_dir, img_path); + RunModel(model_file, img_path); return 0; } diff --git a/lite/demo/cxx/test_cv/README.md b/lite/demo/cxx/test_cv/README.md index 36d2985a4fd4f243027f8caab9b6c5a8beb94cad..21574a9bf9fd0ebb3ecf1663f49beed93fdf51bb 100644 --- a/lite/demo/cxx/test_cv/README.md +++ b/lite/demo/cxx/test_cv/README.md @@ -1,5 +1,5 @@ # 图像预测库的使用 -1. 下载源码(https://github.com/PaddlePaddle/Paddle-Lite),打开LITE_WITH_CV=ON,编译full_publish模式 +1. 
下载源码(https://github.com/PaddlePaddle/Paddle-Lite),打开LITE_WITH_CV=ON,编译full_publish or tiny_publish模式 example: ```shell set BUILD_WITH_CV=ON or LITE_WITH_CV=ON @@ -8,7 +8,7 @@ set BUILD_WITH_CV=ON or LITE_WITH_CV=ON --arm_abi=armv8 --arm_lang=gcc --android_stl=c++_static -full_publish +tiny_publish ``` 2. 准备模型和优化模型 @@ -17,7 +17,7 @@ example: wget http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz tar zxvf mobilenet_v1.tar.gz ./lite/tools/build.sh build_optimize_tool -./build.model_optimize_tool/lite/api/model_optimize_tool +./build.opt/lite/api/opt --optimize_out_type=naive_buffer --optimize_out=model_dir --model_dir=model_dir @@ -68,7 +68,8 @@ make adb -s device_id push mobilenet_v1 /data/local/tmp/ adb -s device_id push test_model_cv /data/local/tmp/ adb -s device_id push test.jpg /data/local/tmp/ -adb -s device_id push ../../../cxx/lib/libpaddle_full_api_shared.so /data/local/tmp/ +adb -s device_id push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/ +#adb -s device_id push ../../../cxx/lib/libpaddle_full_api_shared.so /data/local/tmp/ adb -s device_id shell chmod +x /data/local/tmp/test_model_cv adb -s device_id shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && /data/local/tmp/test_model_cv /data/local/tmp/mobilenet_v1 /data/local/tmp/test.jpg 1 3 224 224 " @@ -119,7 +120,8 @@ make adb -s device_id push mobilenet_v1 /data/local/tmp/ adb -s device_id push test_img_propress /data/local/tmp/ adb -s device_id push test.jpg /data/local/tmp/ -adb -s device_id push ../../../cxx/lib/libpaddle_full_api_shared.so /data/local/tmp/ +adb -s device_id push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/ +#adb -s device_id push ../../../cxx/lib/libpaddle_full_api_shared.so /data/local/tmp/ adb -s device_id shell chmod +x /data/local/tmp/test_model_cv adb -s device_id shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && /data/local/tmp/test_img_propress /data/local/tmp/test.jpg /data/local/tmp/ 3 3 1 3 
224 224 /data/local/tmp/mobilenet_v1 " diff --git a/lite/demo/cxx/test_cv/test_img_prepross.cc b/lite/demo/cxx/test_cv/test_img_prepross.cc index c2cbd66cc0a15a1032141641d83fbf8db85d20bf..3115ba8f0bf1459541d067d466b80c12548f36a8 100644 --- a/lite/demo/cxx/test_cv/test_img_prepross.cc +++ b/lite/demo/cxx/test_cv/test_img_prepross.cc @@ -28,362 +28,874 @@ typedef paddle::lite::utils::cv::ImagePreprocess ImagePreprocess; typedef paddle::lite_api::DataLayoutType LayoutType; using namespace paddle::lite_api; // NOLINT -void fill_with_mat(cv::Mat& mat, uint8_t* src) { // NOLINT +void fill_with_mat(cv::Mat& mat, uint8_t* src, int num) { // NOLINT for (int i = 0; i < mat.rows; i++) { for (int j = 0; j < mat.cols; j++) { - int tmp = (i * mat.cols + j) * 3; - cv::Vec3b& rgb = mat.at(i, j); - rgb[0] = src[tmp]; - rgb[1] = src[tmp + 1]; - rgb[2] = src[tmp + 2]; + if (num == 1) { + int tmp = (i * mat.cols + j); + } else if (num == 2) { + int tmp = (i * mat.cols + j) * 2; + cv::Vec2b& rgb = mat.at(i, j); + rgb[0] = src[tmp]; + rgb[1] = src[tmp + 1]; + rgb[2] = src[tmp + 2]; + } else if (num == 3) { + int tmp = (i * mat.cols + j) * 3; + cv::Vec3b& rgb = mat.at(i, j); + rgb[0] = src[tmp]; + rgb[1] = src[tmp + 1]; + rgb[2] = src[tmp + 2]; + } else if (num == 4) { + int tmp = (i * mat.cols + j) * 4; + cv::Vec4b& rgb = mat.at(i, j); + rgb[0] = src[tmp]; + rgb[1] = src[tmp + 1]; + rgb[2] = src[tmp + 2]; + rgb[3] = src[tmp + 3]; + } else { + std::cout << "it is not support" << std::endl; + return; + } } } } -void test_img(std::vector cluster_id, - std::vector thread_num, - std::string img_path, - std::string dst_path, - ImageFormat srcFormat, - ImageFormat dstFormat, - int width, - int height, - float rotate, - FlipParam flip, - LayoutType layout, - std::string model_dir, - int test_iter = 1) { - // init - // paddle::lite::DeviceInfo::Init(); - // read img and pre-process - cv::Mat img = imread(img_path, cv::IMREAD_COLOR); - float means[3] = {0.485f, 0.456f, 0.406f}; - float scales[3] 
= {0.229f, 0.224f, 0.225f}; - int srch = img.rows; - int srcw = img.cols; - for (auto& cls : cluster_id) { - for (auto& th : thread_num) { - std::cout << "cluster: " << cls << ", threads: " << th << std::endl; - // 1. Set MobileConfig - MobileConfig config; - config.set_model_dir(model_dir); - config.set_power_mode((PowerMode)cls); - config.set_threads(th); - std::cout << "model: " << model_dir; - - // 2. Create PaddlePredictor by MobileConfig - std::shared_ptr predictor = - CreatePaddlePredictor(config); - // 3. Prepare input data from image - std::unique_ptr input_tensor(predictor->GetInput(0)); +double compare_diff(uint8_t* data1, uint8_t* data2, int size, uint8_t* diff_v) { + double diff = 0.0; + for (int i = 0; i < size; i++) { + double val = abs(data1[i] - data2[i]); + diff_v[i] = val; + diff = val > diff ? val : diff; + } + return diff; +} +void print_data(const uint8_t* data, int size) { + for (int i = 0; i < size; i++) { + printf("%d ", data[i]); + if ((i + 1) % 10 == 0) { + std::cout << std::endl; + } + } + std::cout << std::endl; +} +bool test_convert(bool cv_run, + const uint8_t* src, + cv::Mat img, + ImagePreprocess image_preprocess, + int in_size, + int out_size, + ImageFormat srcFormat, + ImageFormat dstFormat, + int dsth, + int dstw, + std::string dst_path, + int test_iter = 1) { + // out + uint8_t* resize_cv = new uint8_t[out_size]; + uint8_t* resize_lite = new uint8_t[out_size]; + cv::Mat im_resize; - /* - imread(img_path, param) - IMREAD_UNCHANGED(<0) 表示加载原图,不做任何改变 - IMREAD_GRAYSCALE ( 0)表示把原图作为灰度图像加载进来 - IMREAD_COLOR (>0) 表示把原图作为RGB图像加载进来 - */ - cv::Mat img; - if (srcFormat == ImageFormat::BGR || srcFormat == ImageFormat::RGB) { - img = imread(img_path, cv::IMREAD_COLOR); - } else if (srcFormat == ImageFormat::GRAY) { - img = imread(img_path, cv::IMREAD_GRAYSCALE); - } else { - printf("this format %d does not support \n", srcFormat); - return; - } - if (img.empty()) { - std::cout << "opencv read image " << img_path.c_str() << " failed" - << 
std::endl; - return; + double to_cv = 0.0; + double to_lite = 0.0; + std::cout << "opencv compute:" << std::endl; + if (cv_run) { + for (int i = 0; i < test_iter; i++) { + clock_t begin = clock(); + // convert bgr-gray + if (dstFormat == srcFormat) { + im_resize = img; + } else if ((dstFormat == ImageFormat::BGR || + dstFormat == ImageFormat::RGB) && + srcFormat == ImageFormat::GRAY) { + cv::cvtColor(img, im_resize, cv::COLOR_GRAY2BGR); + } else if ((srcFormat == ImageFormat::BGR || + dstFormat == ImageFormat::RGBA) && + dstFormat == ImageFormat::GRAY) { + cv::cvtColor(img, im_resize, cv::COLOR_BGR2GRAY); + } else if (dstFormat == srcFormat) { + printf("convert format error \n"); + return false; } - int srch = img.rows; - int srcw = img.cols; - int dsth = height; - int dstw = width; + clock_t end = clock(); + to_cv += (end - begin); + } + } - std::cout << " input tensor size, num= " << 1 << ", channel= " << 1 - << ", height= " << srch << ", width= " << srcw - << ", srcFormat= " << (ImageFormat)srcFormat << std::endl; - // RGBA = 0, BGRA, RGB, BGR, GRAY, NV21 = 11, NV12, - if (srcFormat == ImageFormat::GRAY) { - std::cout << "srcFormat: GRAY" << std::endl; - } - if (srcFormat == ImageFormat::BGR) { - std::cout << "srcFormat: BGR" << std::endl; - } - if (srcFormat == ImageFormat::RGB) { - std::cout << "srcFormat: RGB" << std::endl; - } - std::cout << " output tensor size, num=" << 1 << ", channel=" << 1 - << ", height=" << dsth << ", width=" << dstw - << ", dstFormat= " << (ImageFormat)dstFormat << std::endl; + std::cout << "lite compute:" << std::endl; + for (int i = 0; i < test_iter; i++) { + clock_t begin = clock(); + // resize default linear + image_preprocess.imageConvert(src, resize_lite); + clock_t end = clock(); + to_lite += (end - begin); + } + to_cv = 1000 * to_cv / CLOCKS_PER_SEC; + to_lite = 1000 * to_lite / CLOCKS_PER_SEC; - if (dstFormat == ImageFormat::GRAY) { - std::cout << "dstFormat: GRAY" << std::endl; - } - if (dstFormat == ImageFormat::BGR) { - 
std::cout << "dstFormat: BGR" << std::endl; - } - if (dstFormat == ImageFormat::RGB) { - std::cout << "dstFormat: RGB" << std::endl; + std::cout << "---opencv convert run time: " << to_cv + << "ms, avg: " << to_cv / test_iter << std::endl; + std::cout << "---lite convert run time: " << to_lite + << "ms, avg: " << to_lite / test_iter << std::endl; + std::cout << "compare diff: " << std::endl; + + if (cv_run) { + resize_cv = im_resize.data; + uint8_t* diff_v = new uint8_t[out_size]; + double diff = compare_diff(resize_cv, resize_lite, out_size, diff_v); + if (diff > 1) { + std::cout << "din: " << std::endl; + print_data(src, in_size); + std::cout << "cv out: " << std::endl; + print_data(resize_cv, out_size); + std::cout << "lite out: " << std::endl; + print_data(resize_lite, out_size); + std::cout << "lite out: " << std::endl; + print_data(diff_v, out_size); + return false; + } else { + // save_img + std::cout << "write image: " << std::endl; + std::string resize_name = dst_path + "/convert.jpg"; + cv::Mat resize_mat; + int num = 1; + if (dstFormat == ImageFormat::BGR || dstFormat == ImageFormat::RGB) { + resize_mat = cv::Mat(dsth, dstw, CV_8UC3); + num = 3; + } else if (dstFormat == ImageFormat::BGRA || + dstFormat == ImageFormat::RGBA) { + resize_mat = cv::Mat(dsth, dstw, CV_8UC4); + num = 4; + } else if (dstFormat == ImageFormat::GRAY) { + resize_mat = cv::Mat(dsth, dstw, CV_8UC1); + num = 1; + } else if (dstFormat == ImageFormat::NV12) { + resize_mat = cv::Mat(dsth, dstw, CV_8UC2); + num = 2; } + fill_with_mat(resize_mat, resize_lite, num); + cv::imwrite(resize_name, resize_mat); + + std::cout << "convert successed!" 
<< std::endl; + return true; + } + } +} + +bool test_flip(bool cv_run, + const uint8_t* src, + cv::Mat img, + ImagePreprocess image_preprocess, + int in_size, + int out_size, + FlipParam flip, + ImageFormat dstFormat, + int dsth, + int dstw, + std::string dst_path, + int test_iter = 1) { + // out + uint8_t* resize_cv = new uint8_t[out_size]; + uint8_t* resize_lite = new uint8_t[out_size]; + cv::Mat im_resize; - std::cout << "Rotate = " << rotate << ", Flip = " << flip - << ", Layout = " << static_cast(layout) << std::endl; - if (static_cast(layout) != 1 && static_cast(layout) != 3) { - std::cout << "this layout" << static_cast(layout) - << " is no support" << std::endl; + double to_cv = 0.0; + double to_lite = 0.0; + std::cout << "opencv compute:" << std::endl; + if (cv_run) { + for (int i = 0; i < test_iter; i++) { + clock_t begin = clock(); + // resize default linear + cv::flip(img, im_resize, flip); + clock_t end = clock(); + to_cv += (end - begin); + } + } + std::cout << "lite compute:" << std::endl; + for (int i = 0; i < test_iter; i++) { + clock_t begin = clock(); + // resize default linear + image_preprocess.imageFlip(src, resize_lite); + clock_t end = clock(); + to_lite += (end - begin); + } + to_cv = 1000 * to_cv / CLOCKS_PER_SEC; + to_lite = 1000 * to_lite / CLOCKS_PER_SEC; + + std::cout << "---opencv flip run time: " << to_cv + << "ms, avg: " << to_cv / test_iter << std::endl; + std::cout << "---lite flip run time: " << to_lite + << "ms, avg: " << to_lite / test_iter << std::endl; + std::cout << "compare diff: " << std::endl; + + if (cv_run) { + resize_cv = im_resize.data; + uint8_t* diff_v = new uint8_t[out_size]; + double diff = compare_diff(resize_cv, resize_lite, out_size, diff_v); + if (diff > 1) { + std::cout << "din: " << std::endl; + print_data(src, in_size); + std::cout << "cv out: " << std::endl; + print_data(resize_cv, out_size); + std::cout << "lite out: " << std::endl; + print_data(resize_lite, out_size); + std::cout << "diff out: " << 
std::endl; + print_data(diff_v, out_size); + return false; + } else { + // save_img + std::cout << "write image: " << std::endl; + std::string resize_name = dst_path + "/flip.jpg"; + cv::Mat resize_mat; + int num = 1; + if (dstFormat == ImageFormat::BGR || dstFormat == ImageFormat::RGB) { + resize_mat = cv::Mat(dsth, dstw, CV_8UC3); + num = 3; + } else if (dstFormat == ImageFormat::BGRA || + dstFormat == ImageFormat::RGBA) { + resize_mat = cv::Mat(dsth, dstw, CV_8UC4); + num = 4; + } else if (dstFormat == ImageFormat::GRAY) { + resize_mat = cv::Mat(dsth, dstw, CV_8UC1); + num = 1; + } else if (dstFormat == ImageFormat::NV12) { + resize_mat = cv::Mat(dsth, dstw, CV_8UC2); + num = 2; } - int size = 3 * srch * srcw; - if (srcFormat == ImageFormat::BGR || srcFormat == ImageFormat::RGB) { - size = 3 * srch * srcw; - } else if (srcFormat == ImageFormat::GRAY) { - size = srch * srcw; + fill_with_mat(resize_mat, resize_lite, num); + cv::imwrite(resize_name, resize_mat); + std::cout << "flip successed!" 
<< std::endl; + return true; + } + } +} + +bool test_rotate(bool cv_run, + const uint8_t* src, + cv::Mat img, + ImagePreprocess image_preprocess, + int in_size, + int out_size, + float rotate, + ImageFormat dstFormat, + int dsth, + int dstw, + std::string dst_path, + int test_iter = 1) { + // out + uint8_t* resize_cv = new uint8_t[out_size]; + uint8_t* resize_lite = new uint8_t[out_size]; + cv::Mat im_resize; + + double to_cv = 0.0; + double to_lite = 0.0; + std::cout << "opencv compute:" << std::endl; + if (cv_run) { + for (int i = 0; i < test_iter; i++) { + clock_t begin = clock(); + // rotate 90 + if (rotate == 90) { + cv::flip(img.t(), im_resize, 1); + } else if (rotate == 180) { + cv::flip(img, im_resize, -1); + } else if (rotate == 270) { + cv::flip(img.t(), im_resize, 0); } - uint8_t* src = img.data; + clock_t end = clock(); + to_cv += (end - begin); + } + } + // lite + std::cout << "lite compute:" << std::endl; + for (int i = 0; i < test_iter; i++) { + clock_t begin = clock(); + // resize default linear + image_preprocess.imageRotate(src, resize_lite); + clock_t end = clock(); + to_lite += (end - begin); + } + to_cv = 1000 * to_cv / CLOCKS_PER_SEC; + to_lite = 1000 * to_lite / CLOCKS_PER_SEC; - int out_size = srch * srcw; - int resize = dstw * dsth; + std::cout << "---opencv rotate run time: " << to_cv + << "ms, avg: " << to_cv / test_iter << std::endl; + std::cout << "---lite rotate run time: " << to_lite + << "ms, avg: " << to_lite / test_iter << std::endl; + std::cout << "compare diff: " << std::endl; + if (cv_run) { + resize_cv = im_resize.data; + uint8_t* diff_v = new uint8_t[out_size]; + double diff = compare_diff(resize_cv, resize_lite, out_size, diff_v); + if (diff > 1) { + std::cout << "din: " << std::endl; + print_data(src, in_size); + std::cout << "cv out: " << std::endl; + print_data(resize_cv, out_size); + std::cout << "lite out: " << std::endl; + print_data(resize_lite, out_size); + std::cout << "diff out: " << std::endl; + print_data(diff_v, 
out_size); + return false; + } else { + // save_img + std::cout << "write image: " << std::endl; + std::string resize_name = dst_path + "/rotate.jpg"; + cv::Mat resize_mat; + int num = 1; if (dstFormat == ImageFormat::BGR || dstFormat == ImageFormat::RGB) { - out_size = 3 * srch * srcw; - resize = 3 * dsth * dstw; + resize_mat = cv::Mat(dsth, dstw, CV_8UC3); + num = 3; + } else if (dstFormat == ImageFormat::BGRA || + dstFormat == ImageFormat::RGBA) { + resize_mat = cv::Mat(dsth, dstw, CV_8UC4); + num = 4; } else if (dstFormat == ImageFormat::GRAY) { - out_size = srch * srcw; - resize = dsth * dstw; + resize_mat = cv::Mat(dsth, dstw, CV_8UC1); + num = 1; + } else if (dstFormat == ImageFormat::NV12) { + resize_mat = cv::Mat(dsth, dstw, CV_8UC2); + num = 2; } - // out - uint8_t* lite_dst = new uint8_t[out_size]; - uint8_t* resize_tmp = new uint8_t[resize]; - uint8_t* tv_out_ratote = new uint8_t[out_size]; - uint8_t* tv_out_flip = new uint8_t[out_size]; - std::vector shape_out = {1, 3, srch, srcw}; + fill_with_mat(resize_mat, resize_lite, num); + cv::imwrite(resize_name, resize_mat); + std::cout << "rotate successed!" 
<< std::endl; + return true; + } + } +} - input_tensor->Resize(shape_out); - Tensor dst_tensor = *input_tensor; - std::cout << "opencv compute" << std::endl; - cv::Mat im_convert; - cv::Mat im_resize; - cv::Mat im_rotate; - cv::Mat im_flip; - double to_1 = 0; - double to_2 = 0; - double to_3 = 0; - double to_4 = 0; - double to1 = 0; - for (int i = 0; i < test_iter; i++) { - clock_t start = clock(); - clock_t begin = clock(); - // convert bgr-gray - if (dstFormat == srcFormat) { - im_convert = img; - } else if (dstFormat == ImageFormat::BGR && - srcFormat == ImageFormat::GRAY) { - cv::cvtColor(img, im_convert, cv::COLOR_GRAY2BGR); - } else if (srcFormat == ImageFormat::BGR && - dstFormat == ImageFormat::GRAY) { - cv::cvtColor(img, im_convert, cv::COLOR_BGR2GRAY); - } else if (dstFormat == srcFormat) { - printf("convert format error \n"); - return; - } - clock_t end = clock(); - to_1 += (end - begin); +bool test_resize(bool cv_run, + const uint8_t* src, + cv::Mat img, + ImagePreprocess image_preprocess, + int in_size, + int out_size, + ImageFormat dstFormat, + int dsth, + int dstw, + std::string dst_path, + int test_iter = 1) { + // out + uint8_t* resize_cv = new uint8_t[out_size]; + uint8_t* resize_lite = new uint8_t[out_size]; + cv::Mat im_resize; - begin = clock(); - // resize default linear - cv::resize(im_convert, im_resize, cv::Size(dstw, dsth), 0.f, 0.f); - end = clock(); - to_2 += (end - begin); + double to_cv = 0.0; + double to_lite = 0.0; + std::cout << "opencv compute:" << std::endl; + if (cv_run) { + for (int i = 0; i < test_iter; i++) { + clock_t begin = clock(); + // resize default linear + cv::resize(img, im_resize, cv::Size(dstw, dsth), 0.f, 0.f); + clock_t end = clock(); + to_cv += (end - begin); + } + } + // param + std::cout << "lite compute:" << std::endl; + for (int i = 0; i < test_iter; i++) { + clock_t begin = clock(); + // resize default linear + image_preprocess.imageResize(src, resize_lite); + clock_t end = clock(); + to_lite += (end - 
begin); + } + to_cv = 1000 * to_cv / CLOCKS_PER_SEC; + to_lite = 1000 * to_lite / CLOCKS_PER_SEC; - begin = clock(); - // rotate 90 - if (rotate == 90) { - cv::flip(im_convert.t(), im_rotate, 1); - } else if (rotate == 180) { - cv::flip(im_convert, im_rotate, -1); - } else if (rotate == 270) { - cv::flip(im_convert.t(), im_rotate, 0); - } - end = clock(); - to_3 += (end - begin); + std::cout << "---opencv resize run time: " << to_cv + << "ms, avg: " << to_cv / test_iter << std::endl; + std::cout << "---lite resize run time: " << to_lite + << "ms, avg: " << to_lite / test_iter << std::endl; + std::cout << "compare diff: " << std::endl; - begin = clock(); - // flip - cv::flip(im_convert, im_flip, flip); - end = clock(); - to_4 += (end - begin); - clock_t ovet = clock(); - to1 += (ovet - start); + if (cv_run) { + resize_cv = im_resize.data; + uint8_t* diff_v = new uint8_t[out_size]; + double diff = compare_diff(resize_cv, resize_lite, out_size, diff_v); + if (diff > 10) { + std::cout << "din: " << std::endl; + print_data(src, in_size); + std::cout << "cv out: " << std::endl; + print_data(resize_cv, out_size); + std::cout << "lite out: " << std::endl; + print_data(resize_lite, out_size); + std::cout << "diff out: " << std::endl; + print_data(diff_v, out_size); + return false; + } else { + // save_img + std::cout << "write image: " << std::endl; + std::string resize_name = dst_path + "/resize.jpg"; + cv::Mat resize_mat; + int num = 1; + if (dstFormat == ImageFormat::BGR || dstFormat == ImageFormat::RGB) { + resize_mat = cv::Mat(dsth, dstw, CV_8UC3); + num = 3; + } else if (dstFormat == ImageFormat::BGRA || + dstFormat == ImageFormat::RGBA) { + resize_mat = cv::Mat(dsth, dstw, CV_8UC4); + num = 4; + } else if (dstFormat == ImageFormat::GRAY) { + resize_mat = cv::Mat(dsth, dstw, CV_8UC1); + num = 1; + } else if (dstFormat == ImageFormat::NV12) { + resize_mat = cv::Mat(dsth, dstw, CV_8UC2); + num = 2; } + fill_with_mat(resize_mat, resize_lite, num); + 
cv::imwrite(resize_name, resize_mat); + std::cout << "resize successed!" << std::endl; + return true; + } + } +} + +void test_custom(bool has_img, // input is image + std::string img_path, + std::string in_txt, + std::string dst_path, + ImageFormat srcFormat, + ImageFormat dstFormat, + int srcw, + int srch, + int dstw, + int dsth, + float rotate, + FlipParam flip, + int test_iter = 1) { + // RGBA = 0, BGRA, RGB, BGR, GRAY, NV21 = 11, NV12, + cv::Mat img; + uint8_t* src = nullptr; + int in_size = 0; + if (has_img) { + if (srcFormat == ImageFormat::BGR || srcFormat == ImageFormat::RGB) { + img = imread(img_path, cv::IMREAD_COLOR); + } else if (srcFormat == ImageFormat::GRAY) { + img = imread(img_path, cv::IMREAD_GRAYSCALE); + } else { + printf("this format %d does not support \n", srcFormat); + return; + } + srcw = img.cols; + srch = img.rows; + src = img.data; + } + bool cv_run = true; + if (srcFormat == ImageFormat::GRAY) { + std::cout << "srcFormat: GRAY" << std::endl; + cv_run = false; + } else if (srcFormat == ImageFormat::BGR || srcFormat == ImageFormat::RGB) { + in_size = 3 * srch * srcw; + std::cout << "srcFormat: BGR/RGB" << std::endl; + } else if (srcFormat == ImageFormat::RGBA || srcFormat == ImageFormat::BGRA) { + in_size = 4 * srch * srcw; + std::cout << "srcFormat: BGRA/RGBA" << std::endl; + } else if (srcFormat == ImageFormat::NV12 || srcFormat == ImageFormat::NV21) { + in_size = (3 * srch * srcw) / 2; + cv_run = false; + std::cout << "srcFormat: NV12/NV12" << std::endl; + } + int out_size = dstw * dsth; + // out + if (dstFormat == ImageFormat::GRAY) { + std::cout << "dstFormat: GRAY" << std::endl; + } else if (dstFormat == ImageFormat::BGR || dstFormat == ImageFormat::RGB) { + out_size = 3 * dsth * dstw; + std::cout << "dstFormat: BGR/RGB" << std::endl; + } else if (dstFormat == ImageFormat::RGBA || dstFormat == ImageFormat::BGRA) { + out_size = 4 * dsth * dstw; + std::cout << "dstFormat: BGRA/RGBA" << std::endl; + } else if (dstFormat == 
ImageFormat::NV12 || dstFormat == ImageFormat::NV21) { + out_size = (3 * dsth * dstw) / 2; + cv_run = false; + std::cout << "dstFormat: NV12/NV12" << std::endl; + } - std::cout << "Paddle-lite compute" << std::endl; - double lite_to = 0; - double lite_to_1 = 0; - double lite_to_2 = 0; - double lite_to_3 = 0; - double lite_to_4 = 0; - double lite_to_5 = 0; - TransParam tparam; - tparam.ih = srch; - tparam.iw = srcw; - tparam.oh = dsth; - tparam.ow = dstw; - tparam.flip_param = flip; - tparam.rotate_param = rotate; + if (!has_img) { + src = new uint8_t[in_size]; + // read txt + FILE* fp = fopen(in_txt.c_str(), "r"); + for (int i = 0; i < in_size; i++) { + fscanf(fp, "%d\n", &src[i]); + } + fclose(fp); + int num = 1; + if (srcFormat == ImageFormat::GRAY) { + img = cv::Mat(srch, srcw, CV_8UC1); + } else if (srcFormat == ImageFormat::BGR || srcFormat == ImageFormat::RGB) { + img = cv::Mat(srch, srcw, CV_8UC3); + num = 3; + } else if (srcFormat == ImageFormat::BGRA || + srcFormat == ImageFormat::RGBA) { + img = cv::Mat(srch, srcw, CV_8UC4); + num = 4; + } else if (srcFormat == ImageFormat::NV12 || + srcFormat == ImageFormat::NV21) { + img = cv::Mat(srch, srcw, CV_8UC2); + num = 2; + std::cout << "CV not support NV12"; + } + fill_with_mat(img, src, num); + std::string name = dst_path + "input.jpg"; + cv::imwrite(name, img); // shurutup + } - ImagePreprocess image_preprocess(srcFormat, dstFormat, tparam); + TransParam tparam; + tparam.ih = srch; + tparam.iw = srcw; + tparam.oh = srch; + tparam.ow = srcw; + tparam.flip_param = flip; + tparam.rotate_param = rotate; - for (int i = 0; i < test_iter; ++i) { - clock_t start = clock(); - clock_t begin = clock(); - image_preprocess.imageConvert(src, lite_dst); - clock_t end = clock(); - lite_to_1 += (end - begin); + TransParam tparam1; + tparam1.ih = srch; + tparam1.iw = srcw; + tparam1.oh = dsth; + tparam1.ow = dstw; + tparam1.flip_param = flip; + tparam1.rotate_param = rotate; - begin = clock(); - 
image_preprocess.imageResize(lite_dst, resize_tmp); - end = clock(); - lite_to_2 += (end - begin); + ImagePreprocess image_preprocess(srcFormat, dstFormat, tparam); + std::cout << "image convert testing" << std::endl; + bool re = test_convert(cv_run, + src, + img, + image_preprocess, + in_size, + out_size, + srcFormat, + dstFormat, + srch, + srcw, + dst_path, + test_iter); + if (!re) { + return; + } + std::cout << "image resize testing" << std::endl; + tparam.oh = dsth; + tparam.ow = dstw; + ImagePreprocess image_preprocess1(srcFormat, srcFormat, tparam1); + re = test_resize(cv_run, + src, + img, + image_preprocess1, + in_size, + out_size, + srcFormat, + dsth, + dstw, + dst_path, + test_iter); + if (!re) { + return; + } - begin = clock(); - image_preprocess.imageRotate( - lite_dst, tv_out_ratote, (ImageFormat)dstFormat, srcw, srch, 90); - end = clock(); - lite_to_3 += (end - begin); + std::cout << "image rotate testing" << std::endl; + if (rotate == 90 || rotate == 270) { + tparam.oh = srcw; + tparam.ow = srch; + dsth = srcw; + dstw = srch; + } else { + tparam.oh = srch; + tparam.ow = srcw; + dsth = srch; + dstw = srcw; + } + ImagePreprocess image_preprocess2(srcFormat, srcFormat, tparam); + re = test_rotate(cv_run, + src, + img, + image_preprocess2, + in_size, + out_size, + rotate, + srcFormat, + dsth, + dstw, + dst_path, + test_iter); + if (!re) { + return; + } + tparam.oh = srch; + tparam.ow = srcw; + ImagePreprocess image_preprocess3(srcFormat, srcFormat, tparam); + std::cout << "image flip testing" << std::endl; + re = test_flip(cv_run, + src, + img, + image_preprocess3, + in_size, + out_size, + flip, + srcFormat, + srch, + srcw, + dst_path, + test_iter); + if (!re) { + return; + } +} - begin = clock(); - image_preprocess.imageFlip( - lite_dst, tv_out_flip, (ImageFormat)dstFormat, srcw, srch, flip); - end = clock(); - lite_to_4 += (end - begin); +#if 0 +void test_all_r(std::string dst_path, int test_iter = 1) { + // RGBA = 0, BGRA, RGB, BGR, GRAY, NV21 = 11, 
NV12, + cv::Mat img; + uint8_t* src = nullptr; + int in_size = 0; + for (auto& srcFormat : {1, 3, 4, 11}) { + for (auto& dstFormat : {1, 3, 4, 11}) { + for (auto& srcw : {10, 112, 200}) { + for (auto& srch : {10, 224, 400}) { + for (auto& dstw : {12, 224, 180}) { + for (auto& dsth : {12, 224, 320}) { + for (auto& flip : {-1, 0, 1}) { + for (auto& rotate : {90, 180, 270}) { + TransParam tparam; + tparam.ih = srch; + tparam.iw = srcw; + tparam.oh = srch; + tparam.ow = srcw; + tparam.flip_param = (FlipParam)flip; + tparam.rotate_param = rotate; - clock_t over = clock(); - lite_to += (over - start); + TransParam tparam1; + tparam1.ih = srch; + tparam1.iw = srcw; + tparam1.oh = dsth; + tparam1.ow = dstw; + tparam1.flip_param = (FlipParam)flip; + tparam.rotate_param = rotate; - begin = clock(); - image_preprocess.image2Tensor(lite_dst, - &dst_tensor, - (ImageFormat)dstFormat, - srcw, - srch, - layout, - means, - scales); - end = clock(); - lite_to_5 += (end - begin); - } - to_1 = 1000 * to_1 / CLOCKS_PER_SEC; - to_2 = 1000 * to_2 / CLOCKS_PER_SEC; - to_3 = 1000 * to_3 / CLOCKS_PER_SEC; - to_4 = 1000 * to_4 / CLOCKS_PER_SEC; - to1 = 1000 * to1 / CLOCKS_PER_SEC; - std::cout << "opencv convert run time: " << to_1 - << "ms, avg: " << to_1 / test_iter << std::endl; - std::cout << "opencv resize run time: " << to_2 - << "ms, avg: " << to_2 / test_iter << std::endl; - std::cout << "opencv rotate run time: " << to_3 - << "ms, avg: " << to_3 / test_iter << std::endl; - std::cout << "opencv flip time: " << to_4 - << "ms, avg: " << to_4 / test_iter << std::endl; - std::cout << "opencv total run time: " << to1 - << "ms, avg: " << to1 / test_iter << std::endl; - std::cout << "------" << std::endl; + ImagePreprocess image_preprocess( + (ImageFormat)srcFormat, (ImageFormat)dstFormat, tparam); + ImagePreprocess image_preprocess1( + (ImageFormat)srcFormat, (ImageFormat)srcFormat, tparam1); + ImagePreprocess image_preprocess2( + (ImageFormat)srcFormat, (ImageFormat)srcFormat, tparam); + 
int h = srch; + int w = srcw; + if (rotate == 90 || rotate == 270) { + tparam.oh = srcw; + h = srcw; + tparam.ow = srch; + w = srch; + } + ImagePreprocess image_preprocess3( + (ImageFormat)srcFormat, (ImageFormat)srcFormat, tparam); + int in_size = srcw * srch; + int out_size = dstw * dsth; + if (srcFormat == ImageFormat::GRAY) { + std::cout << "srcFormat: GRAY" << std::endl; + } else if (srcFormat == ImageFormat::BGR || + srcFormat == ImageFormat::RGB) { + in_size = 3 * srch * srcw; + std::cout << "srcFormat: BGR/RGB" << std::endl; + } else if (srcFormat == ImageFormat::RGBA || + srcFormat == ImageFormat::BGRA) { + in_size = 4 * srch * srcw; + std::cout << "srcFormat: BGRA/RGBA" << std::endl; + } else if (srcFormat == ImageFormat::NV12 || + srcFormat == ImageFormat::NV21) { + in_size = (3 * srch * srcw) / 2; + std::cout << "srcFormat: NV12/NV12" << std::endl; + } + // out + if (dstFormat == ImageFormat::GRAY) { + std::cout << "dstFormat: GRAY" << std::endl; + } else if (dstFormat == ImageFormat::BGR || + dstFormat == ImageFormat::RGB) { + out_size = 3 * dsth * dstw; + std::cout << "dstFormat: BGR/RGB" << std::endl; + } else if (dstFormat == ImageFormat::RGBA || + dstFormat == ImageFormat::BGRA) { + out_size = 4 * dsth * dstw; + std::cout << "dstFormat: BGRA/RGBA" << std::endl; + } else if (dstFormat == ImageFormat::NV12 || + dstFormat == ImageFormat::NV21) { + out_size = (3 * dsth * dstw) / 2; + std::cout << "dstFormat: NV12/NV12" << std::endl; + } + // init + uint8_t* src = new uint8_t[in_size]; + for (int i = 0; i < in_size; i++) { + src[i] = i % 255; + } + cv::Mat img; + int num = 1; + bool cv_run = true; + if (srcFormat == ImageFormat::GRAY) { + img = cv::Mat(srch, srcw, CV_8UC1); + cv_run = false; + } else if (srcFormat == ImageFormat::BGR || + srcFormat == ImageFormat::RGB) { + img = cv::Mat(srch, srcw, CV_8UC3); + num = 3; + } else if (srcFormat == ImageFormat::BGRA || + srcFormat == ImageFormat::RGBA) { + img = cv::Mat(srch, srcw, CV_8UC4); + num = 4; + } 
else if (srcFormat == ImageFormat::NV12 || + srcFormat == ImageFormat::NV21) { + img = cv::Mat(srch, srcw, CV_8UC2); + num = 2; + cv_run = false; + } + fill_with_mat(img, src, num); + std::string name = dst_path + "input.jpg"; + cv::imwrite(name, img); // shurutup + // convert + bool convert = true; + if (srcFormat == 11 || dstFormat == 11) { + // NV12, cv not support + convert = false; + cv_run = false; + } + if (convert) { + std::cout << "image convert testing"; + bool re = test_convert(cv_run, + src, + img, + image_preprocess, + in_size, + out_size, + (ImageFormat)srcFormat, + (ImageFormat)dstFormat, + srch, + srcw, + dst_path, + test_iter); + if (!re) { + return; + } + } - lite_to_1 = 1000 * lite_to_1 / CLOCKS_PER_SEC; - lite_to_2 = 1000 * lite_to_2 / CLOCKS_PER_SEC; - lite_to_3 = 1000 * lite_to_3 / CLOCKS_PER_SEC; - lite_to_4 = 1000 * lite_to_4 / CLOCKS_PER_SEC; - lite_to_5 = 1000 * lite_to_5 / CLOCKS_PER_SEC; - lite_to = 1000 * lite_to / CLOCKS_PER_SEC; - std::cout << "lite convert run time: " << lite_to_1 - << "ms, avg: " << lite_to_1 / test_iter << std::endl; - std::cout << "lite resize run time: " << lite_to_2 - << "ms, avg: " << lite_to_2 / test_iter << std::endl; - std::cout << "lite rotate run time: " << lite_to_3 - << "ms, avg: " << lite_to_3 / test_iter << std::endl; - std::cout << "lite flip time: " << lite_to_4 - << "ms, avg: " << lite_to_4 / test_iter << std::endl; - std::cout << "lite total run time: " << lite_to - << "ms, avg: " << lite_to / test_iter << std::endl; - std::cout << "lite img2tensor time: " << lite_to_5 - << "ms, avg: " << lite_to_5 / test_iter << std::endl; - std::cout << "------" << std::endl; + // resize + std::cout << "image resize testing"; + bool re = test_resize(cv_run, + src, + img, + image_preprocess1, + in_size, + out_size, + (ImageFormat)srcFormat, + dsth, + dstw, + dst_path, + test_iter); + if (convert && !re) { + return; + } + // rotate + std::cout << "image rotate testing"; - double max_ratio = 0; - double max_diff = 
0; - const double eps = 1e-6f; - // save_img - std::cout << "write image: " << std::endl; - std::string resize_name = dst_path + "/resize.jpg"; - std::string convert_name = dst_path + "/convert.jpg"; - std::string rotate_name = dst_path + "/rotate.jpg"; - std::string flip_name = dst_path + "/flip.jpg"; - cv::Mat resize_mat(dsth, dstw, CV_8UC3); - cv::Mat convert_mat(srch, srcw, CV_8UC3); - cv::Mat rotate_mat; - if (rotate == 90 || rotate == 270) { - rotate_mat = cv::Mat(srcw, srch, CV_8UC3); - } else { - rotate_mat = cv::Mat(srch, srcw, CV_8UC3); + re = test_rotate(cv_run, + src, + img, + image_preprocess3, + in_size, + out_size, + rotate, + (ImageFormat)srcFormat, + h, + w, + dst_path, + test_iter); + if (convert && !re) { + return; + } + // flip + std::cout << "image rotate testing"; + re = test_flip(cv_run, + src, + img, + image_preprocess2, + in_size, + out_size, + (FlipParam)flip, + (ImageFormat)srcFormat, + srch, + srcw, + dst_path, + test_iter); + if (convert && !re) { + return; + } + } + } + } + } + } } - cv::Mat flip_mat(srch, srcw, CV_8UC3); - fill_with_mat(resize_mat, resize_tmp); - fill_with_mat(convert_mat, lite_dst); - fill_with_mat(rotate_mat, tv_out_ratote); - fill_with_mat(flip_mat, tv_out_flip); - cv::imwrite(convert_name, convert_mat); - cv::imwrite(resize_name, resize_mat); - cv::imwrite(rotate_name, rotate_mat); - cv::imwrite(flip_name, flip_mat); - delete[] lite_dst; - delete[] resize_tmp; - delete[] tv_out_ratote; - delete[] tv_out_flip; } } } +#endif int main(int argc, char** argv) { if (argc < 7) { std::cerr << "[ERROR] usage: " << argv[0] - << " image_path dst_apth srcFormat dstFormat width height\n"; + << " has_img image_path/txt_path dst_apth srcFormat dstFormat " + "dstw dsth " + << "[options] srcw srch flip rotate test_iter\n "; exit(1); } - std::string image_path = argv[1]; - std::string dst_path = argv[2]; - int srcFormat = atoi(argv[3]); - int dstFormat = atoi(argv[4]); - int width = atoi(argv[5]); - int height = atoi(argv[6]); + 
bool has_img = atoi(argv[1]); + std::string path = argv[2]; + std::string dst_path = argv[3]; + int srcFormat = atoi(argv[4]); + int dstFormat = atoi(argv[5]); + int dstw = atoi(argv[6]); + int dsth = atoi(argv[7]); + int srcw = 100; + int srch = 100; int flip = -1; float rotate = 90; - int layout = 1; - std::string model_dir = "mobilenet_v1"; - if (argc > 7) { - model_dir = argv[7]; - } - if (argc > 8) { - flip = atoi(argv[8]); - } - if (argc > 9) { - rotate = atoi(argv[9]); - } - if (argc > 10) { - layout = atoi(argv[10]); + int test_iter = 10; + if (!has_img) { + std::cout << "It needs srcw and srch"; + srcw = atoi(argv[8]); + srch = atoi(argv[9]); + if (argc > 10) { + flip = atoi(argv[10]); + } + if (argc > 11) { + rotate = atoi(argv[11]); + } + if (argc > 12) { + test_iter = atoi(argv[12]); + } + } else { + if (argc > 8) { + flip = atoi(argv[8]); + } + if (argc > 9) { + rotate = atoi(argv[9]); + } + if (argc > 10) { + test_iter = atoi(argv[10]); + } } - test_img({3}, - {1, 2, 4}, - image_path, - dst_path, - (ImageFormat)srcFormat, - (ImageFormat)dstFormat, - width, - height, - rotate, - (FlipParam)flip, - (LayoutType)layout, - model_dir, - 20); + test_custom(has_img, + path, + path, + dst_path, + (ImageFormat)srcFormat, + (ImageFormat)dstFormat, + srcw, + srch, + dstw, + dsth, + rotate, + (FlipParam)flip, + test_iter); +#if 0 + test_all_r(dst_path, test_iter); +#endif return 0; } diff --git a/lite/demo/cxx/test_cv/test_model_cv.cc b/lite/demo/cxx/test_cv/test_model_cv.cc index 24f408bf4a55ea2d499e39902201597c0e8c6e4e..caa085eecb81e54859c1bdd5cd7c0654175b7a9a 100644 --- a/lite/demo/cxx/test_cv/test_model_cv.cc +++ b/lite/demo/cxx/test_cv/test_model_cv.cc @@ -111,7 +111,7 @@ void pre_process(const cv::Mat& img, int width, int height, Tensor dstTensor) { #endif } -void RunModel(std::string model_dir, +void RunModel(std::string model_file, std::string img_path, std::vector input_shape, PowerMode power_mode, @@ -120,7 +120,7 @@ void RunModel(std::string model_dir, 
int warmup = 0) { // 1. Set MobileConfig MobileConfig config; - config.set_model_dir(model_dir); + config.set_model_from_file(model_file); config.set_power_mode(power_mode); config.set_threads(thread_num); @@ -161,7 +161,7 @@ void RunModel(std::string model_dir, } std::cout << "================== Speed Report ===================" << std::endl; - std::cout << "Model: " << model_dir + std::cout << "Model: " << model_file << ", power_mode: " << static_cast(power_mode) << ", threads num " << thread_num << ", warmup: " << warmup << ", repeats: " << test_iter << ", avg time: " << lps / test_iter @@ -187,10 +187,10 @@ void RunModel(std::string model_dir, int main(int argc, char** argv) { if (argc < 7) { std::cerr << "[ERROR] usage: " << argv[0] - << " model_dir image_path input_shape\n"; + << " model_file image_path input_shape\n"; exit(1); } - std::string model_dir = argv[1]; + std::string model_file = argv[1]; std::string img_path = argv[2]; std::vector input_shape; input_shape.push_back(atoi(argv[3])); @@ -213,7 +213,7 @@ int main(int argc, char** argv) { if (argc > 10) { warmup = atoi(argv[10]); } - RunModel(model_dir, + RunModel(model_file, img_path, input_shape, (PowerMode)power_mode, diff --git a/lite/demo/cxx/yolov3_detection/yolov3_detection.cc b/lite/demo/cxx/yolov3_detection/yolov3_detection.cc index a9beb1ed28de1f3c28eb5c03b3b660d518ee10c5..d34319050392c74c3fa552bd24c0ea24245ced99 100644 --- a/lite/demo/cxx/yolov3_detection/yolov3_detection.cc +++ b/lite/demo/cxx/yolov3_detection/yolov3_detection.cc @@ -182,10 +182,10 @@ std::vector detect_object(const float* data, return rect_out; } -void RunModel(std::string model_dir, std::string img_path) { +void RunModel(std::string model_file, std::string img_path) { // 1. Set MobileConfig MobileConfig config; - config.set_model_dir(model_dir); + config.set_model_from_file(model_file); // 2. 
Create PaddlePredictor by MobileConfig std::shared_ptr predictor = @@ -228,11 +228,11 @@ void RunModel(std::string model_dir, std::string img_path) { int main(int argc, char** argv) { if (argc < 3) { - std::cerr << "[ERROR] usage: " << argv[0] << " model_dir image_path\n"; + std::cerr << "[ERROR] usage: " << argv[0] << " model_file image_path\n"; exit(1); } - std::string model_dir = argv[1]; + std::string model_file = argv[1]; std::string img_path = argv[2]; - RunModel(model_dir, img_path); + RunModel(model_file, img_path); return 0; } diff --git a/lite/gen_code/gen_code.cc b/lite/gen_code/gen_code.cc index 0d8f4d0d192f3563d00bb66778ca4e13a17b93b1..6c43f6e0116d9adfc4fc6f315d5653b2634dfe7b 100644 --- a/lite/gen_code/gen_code.cc +++ b/lite/gen_code/gen_code.cc @@ -111,11 +111,11 @@ void Module::AddOpDescHelper(const std::string &op_id, switch (type) { case AttrType::INT: - return std::to_string(desc.GetAttr(name)); + return paddle::lite::to_string(desc.GetAttr(name)); case AttrType::FLOAT: - return std::to_string(desc.GetAttr(name)); + return paddle::lite::to_string(desc.GetAttr(name)); case AttrType::BOOLEAN: - return std::to_string(desc.GetAttr(name)); + return paddle::lite::to_string(desc.GetAttr(name)); case AttrType::STRING: return "\"" + desc.GetAttr(name) + "\""; case AttrType::FLOATS: { diff --git a/lite/gen_code/gen_code.h b/lite/gen_code/gen_code.h index 58a7959f4eb34cb438bf0e25b49b36110435cc6b..d316eac43f99664fa71cba54b3ab5360852300a0 100644 --- a/lite/gen_code/gen_code.h +++ b/lite/gen_code/gen_code.h @@ -153,16 +153,16 @@ class Module { private: std::string WeightUniqueName() const { - return "w_" + std::to_string(weight_counter_++); + return "w_" + paddle::lite::to_string(weight_counter_++); } std::string TmpVarUniqueName() const { - return "tmp_" + std::to_string(tmp_var_counter_++); + return "tmp_" + paddle::lite::to_string(tmp_var_counter_++); } std::string OpUniqueName() const { - return "op_" + std::to_string(op_counter_++); + return "op_" + 
paddle::lite::to_string(op_counter_++); } std::string KernelUniqueName() const { - return "kernel_" + std::to_string(kernel_counter_++); + return "kernel_" + paddle::lite::to_string(kernel_counter_++); } std::string DataRepr(const std::string &raw_data, PrecisionType dtype); diff --git a/lite/kernels/arm/CMakeLists.txt b/lite/kernels/arm/CMakeLists.txt index 60d5e3b5e234ef19cd144100d07441eb4acf48de..7550d770145d92ebd343f96a82c6f34d72c91ea5 100644 --- a/lite/kernels/arm/CMakeLists.txt +++ b/lite/kernels/arm/CMakeLists.txt @@ -1,6 +1,6 @@ # NOTE we leave the add_kernel not protected by LITE_WITH_LIGHT_WEIGHT_FRAMEWORK so that all the kernels will be registered # to the model_optimize_tool. -if((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM))) +if((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT LITE_WITH_PYTHON) AND (NOT (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM))) return() endif() @@ -68,6 +68,7 @@ add_kernel(reduce_max_compute_arm ARM extra SRCS reduce_max_compute.cc DEPS ${li add_kernel(sequence_expand_compute_arm ARM extra SRCS sequence_expand_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(im2sequence_compute_arm ARM extra SRCS im2sequence_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(sequence_pool_compute_arm ARM extra SRCS sequence_pool_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(sequence_conv_compute_arm ARM extra SRCS sequence_conv_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(layer_norm_compute_arm ARM extra SRCS layer_norm_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(gather_compute_arm ARM extra SRCS gather_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(reduce_prod_compute_arm ARM extra SRCS reduce_prod_compute.cc DEPS ${lite_kernel_deps} math_arm) @@ -82,15 +83,14 @@ add_kernel(conditional_block_compute_arm ARM extra SRCS conditional_block_comput add_kernel(collect_fpn_proposals_compute_arm ARM extra SRCS collect_fpn_proposals_compute.cc 
DEPS ${lite_kernel_deps} math_arm) add_kernel(distribute_fpn_proposals_compute_arm ARM extra SRCS distribute_fpn_proposals_compute.cc DEPS ${lite_kernel_deps} math_arm) - # for OCR specific add_kernel(gru_unit_compute_arm ARM extra SRCS gru_unit_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(gru_compute_arm ARM extra SRCS gru_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(beam_search_decode_compute_arm ARM extra SRCS beam_search_decode_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(lookup_table_compute_arm ARM extra SRCS lookup_table_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(lookup_table_dequant_compute_arm ARM extra SRCS lookup_table_dequant_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(logical_compute_arm ARM extra SRCS logical_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(sequence_softmax_compute_arm ARM extra SRCS sequence_softmax_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(less_than_arm ARM extra SRCS compare_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(while_compute_arm ARM extra SRCS while_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(compare_compute_arm ARM extra SRCS compare_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(topk_compute_arm ARM extra SRCS topk_compute.cc DEPS ${lite_kernel_deps} math_arm) @@ -99,9 +99,20 @@ add_kernel(write_to_array_compute_arm ARM extra SRCS write_to_array_compute.cc D add_kernel(read_from_array_compute_arm ARM extra SRCS read_from_array_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(beam_search_compute_arm ARM extra SRCS beam_search_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(fill_constant_compute_arm ARM basic SRCS fill_constant_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(fill_constant_batch_size_like_compute_arm ARM basic SRCS fill_constant_batch_size_like_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(lod_reset_compute_arm ARM extra SRCS lod_reset_compute.cc 
DEPS ${lite_kernel_deps} math_arm) add_kernel(is_empty_compute_arm ARM extra SRCS is_empty_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(lstm_arm ARM extra SRCS lstm_compute.cc DEPS ${lite_kernel_deps} math_arm) +# 4. training kernels +add_kernel(mean_compute_arm ARM extra SRCS mean_compute.cc DEPS ${lite_kernel_deps} math_arm) +if(LITE_WITH_TRAIN) + add_kernel(mean_grad_compute_arm ARM extra SRCS mean_grad_compute.cc DEPS ${lite_kernel_deps} math_arm) + add_kernel(activation_grad_compute_arm ARM basic SRCS activation_grad_compute.cc DEPS ${lite_kernel_deps} math_arm) + add_kernel(elementwise_grad_compute_arm ARM basic SRCS elementwise_grad_compute.cc DEPS ${lite_kernel_deps} math_arm) + add_kernel(mul_grad_compute_arm ARM extra SRCS mul_grad_compute.cc DEPS ${lite_kernel_deps} math_arm) + add_kernel(sgd_compute_arm ARM extra SRCS sgd_compute.cc DEPS ${lite_kernel_deps} math_arm) +endif() lite_cc_test(test_scale_compute_arm SRCS scale_compute_test.cc DEPS scale_compute_arm) lite_cc_test(test_softmax_compute_arm SRCS softmax_compute_test.cc DEPS softmax_compute_arm) @@ -121,5 +132,4 @@ if(LITE_BUILD_EXTRA) lite_cc_test(test_decode_bboxes_compute_arm SRCS decode_bboxes_compute_test.cc DEPS decode_bboxes_compute_arm) lite_cc_test(test_axpy_compute_arm SRCS axpy_compute_test.cc DEPS axpy_compute_arm) lite_cc_test(test_layer_norm_compute_arm SRCS layer_norm_compute_test.cc DEPS layer_norm_compute_arm) - lite_cc_test(test_lookup_table_compute_arm SRCS lookup_table_compute_test.cc DEPS lookup_table_compute_arm) endif() diff --git a/lite/kernels/arm/activation_compute.cc b/lite/kernels/arm/activation_compute.cc index d50049d48748cf7ec43485a12fa7c65c0171a63d..d609716ee53ec584b8340e9b72498ed95afd5820 100644 --- a/lite/kernels/arm/activation_compute.cc +++ b/lite/kernels/arm/activation_compute.cc @@ -169,6 +169,16 @@ void RsqrtCompute::Run() { x_data, output_data, x_dims.production(), ctx.threads()); } +void SquareCompute::Run() { + auto& param = this->Param(); + 
auto& ctx = this->ctx_->template As(); + auto x_dims = param.X->dims(); + auto x_data = param.X->data(); + auto output_data = param.Out->mutable_data(); + lite::arm::math::act_square( + x_data, output_data, x_dims.production(), ctx.threads()); +} + } // namespace arm } // namespace kernels } // namespace lite @@ -260,3 +270,8 @@ REGISTER_LITE_KERNEL( .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); +REGISTER_LITE_KERNEL( + square, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::SquareCompute, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) + .Finalize(); diff --git a/lite/kernels/arm/activation_compute.h b/lite/kernels/arm/activation_compute.h index ba1318ea36d01d1c3352679e7b5de12d013c0e84..476d7bb0a32db193d9afb1451507699d0af71736 100644 --- a/lite/kernels/arm/activation_compute.h +++ b/lite/kernels/arm/activation_compute.h @@ -139,6 +139,15 @@ class RsqrtCompute : public KernelLite { virtual ~RsqrtCompute() = default; }; +class SquareCompute : public KernelLite { + public: + using param_t = operators::ActivationParam; + + void Run() override; + + virtual ~SquareCompute() = default; +}; + } // namespace arm } // namespace kernels } // namespace lite diff --git a/lite/kernels/arm/activation_grad_compute.cc b/lite/kernels/arm/activation_grad_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..137668fa5e0d1bd07e838b3040a31e084a7475c8 --- /dev/null +++ b/lite/kernels/arm/activation_grad_compute.cc @@ -0,0 +1,52 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/arm/activation_grad_compute.h" +#include "lite/backends/arm/math/funcs.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +void SquareGradCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->template As(); + auto out_grad_dims = param.Out_grad->dims(); + auto out_grad_data = param.Out_grad->data(); + + auto x_data = param.X->data(); + auto x_grad_data = param.X_grad->mutable_data(); + lite::arm::math::act_square_grad(x_data, + out_grad_data, + x_grad_data, + out_grad_dims.production(), + ctx.threads()); +} + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(square_grad, + kARM, + kFloat, + kNCHW, + paddle::lite::kernels::arm::SquareGradCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Out@GRAD", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("X@GRAD", {LiteType::GetTensorTy(TARGET(kARM))}) + .Finalize(); diff --git a/lite/kernels/arm/activation_grad_compute.h b/lite/kernels/arm/activation_grad_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..ef03f58fa8cd499192aa6edfe3a7c51b49b14f65 --- /dev/null +++ b/lite/kernels/arm/activation_grad_compute.h @@ -0,0 +1,37 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +class SquareGradCompute : public KernelLite { + public: + using param_t = operators::ActivationGradParam; + + void Run() override; + + virtual ~SquareGradCompute() = default; +}; + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/arm/argmax_compute.cc b/lite/kernels/arm/argmax_compute.cc index ad279e8f8e1f80639c0b2512f89595d01ef062fd..dda38809875e46835c99b35e564473056391d2c6 100644 --- a/lite/kernels/arm/argmax_compute.cc +++ b/lite/kernels/arm/argmax_compute.cc @@ -30,6 +30,9 @@ void ArgmaxCompute::Run() { lite::Tensor* input = param.X; lite::Tensor* output = param.Out; int axis = param.Axis; + if (axis < 0) { + axis += input->dims().size(); + } lite::arm::math::argmax_func(input, axis, output); return; @@ -47,5 +50,5 @@ REGISTER_LITE_KERNEL(arg_max, paddle::lite::kernels::arm::ArgmaxCompute, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) .Finalize(); diff --git a/lite/kernels/arm/argmax_compute_test.cc b/lite/kernels/arm/argmax_compute_test.cc index 58bdf18474ae69b2bdb863b9818dab41e25bf17b..034d57cdaba77130b319d203c3ae0616720c9d31 100644 --- a/lite/kernels/arm/argmax_compute_test.cc +++ b/lite/kernels/arm/argmax_compute_test.cc @@ -33,7 +33,7 @@ void 
argmax_compute_ref(const operators::ArgmaxParam& param) { int axis = param.Axis; auto x_data = x->data(); - auto output_data = output->mutable_data(); + auto output_data = output->mutable_data(); DDim x_dims = x->dims(); DDim output_dims = output->dims(); @@ -59,7 +59,7 @@ void argmax_compute_ref(const operators::ArgmaxParam& param) { std::greater>()); // out - dtype* out_ptr = output_data + n * out_channel + k; + auto* out_ptr = output_data + n * out_channel + k; *out_ptr = vec[0].second; } } @@ -115,12 +115,12 @@ TEST(argmax_arm, compute) { param.Axis = axis; argmaxOp.SetParam(param); argmaxOp.Launch(); - auto* output_data = output.mutable_data(); + auto* output_data = output.mutable_data(); // obtain output_ref_data param.Out = &output_ref; argmax_compute_ref(param); - auto* output_ref_data = output_ref.mutable_data(); + auto* output_ref_data = output_ref.mutable_data(); // compare for (int i = 0; i < output.dims().production(); i++) { diff --git a/lite/kernels/arm/assign_compute.cc b/lite/kernels/arm/assign_compute.cc index b0a5529c368c67c30dfb8517a89bb35c5440e122..8398634bb365c628b64e1ddd2b14984d5f2acb59 100644 --- a/lite/kernels/arm/assign_compute.cc +++ b/lite/kernels/arm/assign_compute.cc @@ -23,16 +23,9 @@ namespace lite { namespace kernels { namespace arm { -void AssignCompute::PrepareForRun() { - // CHECK_OR_FALSE(param_t.Out); -} - void AssignCompute::Run() { - // LOG(INFO) << "into kernel compute run"; auto& param = Param(); - const lite::Tensor* input = param.X; - lite::Tensor* output = param.Out; - output->CopyDataFrom(*input); + param.Out->CopyDataFrom(*param.X); } } // namespace arm @@ -41,7 +34,7 @@ void AssignCompute::Run() { } // namespace paddle REGISTER_LITE_KERNEL( - assign, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::AssignCompute, def) - .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) + assign, kARM, kAny, kNCHW, paddle::lite::kernels::arm::AssignCompute, def) + 
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))}) .Finalize(); diff --git a/lite/kernels/arm/assign_compute.h b/lite/kernels/arm/assign_compute.h index 3f0dd8e281047c4201ba4561dbd60250ce5749d2..e144486b5970b4e4e82c58148e33ccc5b2d37ff4 100644 --- a/lite/kernels/arm/assign_compute.h +++ b/lite/kernels/arm/assign_compute.h @@ -22,10 +22,10 @@ namespace lite { namespace kernels { namespace arm { -class AssignCompute : public KernelLite { +class AssignCompute : public KernelLite { public: using param_t = operators::AssignParam; - void PrepareForRun() override; + void Run() override; virtual ~AssignCompute() = default; diff --git a/lite/kernels/arm/assign_value_compute.cc b/lite/kernels/arm/assign_value_compute.cc index 45f28ba36369cc79d70d683894c8a934b9308863..1d097e336f156966689823f4ef6d0d36bc536545 100644 --- a/lite/kernels/arm/assign_value_compute.cc +++ b/lite/kernels/arm/assign_value_compute.cc @@ -58,9 +58,9 @@ void AssignValueCompute::Run() { REGISTER_LITE_KERNEL(assign_value, kARM, - kFloat, + kAny, kNCHW, paddle::lite::kernels::arm::AssignValueCompute, def) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))}) .Finalize(); diff --git a/lite/kernels/arm/assign_value_compute.h b/lite/kernels/arm/assign_value_compute.h index f0c33f865bb770adc64a1727521fad10d0516ede..32b1fb41ab733dc3827496833a633dd415f098b9 100644 --- a/lite/kernels/arm/assign_value_compute.h +++ b/lite/kernels/arm/assign_value_compute.h @@ -22,7 +22,7 @@ namespace lite { namespace kernels { namespace arm { -class AssignValueCompute : public KernelLite { +class AssignValueCompute : public KernelLite { public: using param_t = operators::AssignValueParam; diff --git a/lite/kernels/arm/beam_search_compute.cc b/lite/kernels/arm/beam_search_compute.cc index 
5ac53b3b96d0ba676e2909d6102e9edded5e9a92..437ba070b7eaf2d6edc8ecd2dd161f57c8fac345 100644 --- a/lite/kernels/arm/beam_search_compute.cc +++ b/lite/kernels/arm/beam_search_compute.cc @@ -20,8 +20,6 @@ namespace lite { namespace kernels { namespace arm { -void BeamSearchCompute::PrepareForRun() {} - void BeamSearchCompute::Run() { auto& ctx = this->ctx_->template As(); auto& param = this->Param(); @@ -50,11 +48,17 @@ REGISTER_LITE_KERNEL(beam_search, kNCHW, paddle::lite::kernels::arm::BeamSearchCompute, def) - .BindInput("pre_ids", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindInput("pre_scores", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindInput("ids", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindInput("scores", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("selected_ids", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("selected_scores", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("parent_idx", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("pre_ids", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) + .BindInput("pre_scores", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))}) + .BindInput("ids", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) + .BindInput("scores", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))}) + .BindOutput("selected_ids", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) + .BindOutput("selected_scores", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))}) + .BindOutput("parent_idx", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) .Finalize(); diff --git a/lite/kernels/arm/beam_search_compute.h b/lite/kernels/arm/beam_search_compute.h index ebd72732bb25e826c24f20cd28588b170f344268..854696e5b9f40b480f2c92592245e52f46bc8f14 100644 --- a/lite/kernels/arm/beam_search_compute.h +++ b/lite/kernels/arm/beam_search_compute.h @@ -25,10 +25,6 @@ namespace arm { class BeamSearchCompute : public KernelLite { public: - using param_t = operators::BeamSearchParam; 
- - void PrepareForRun() override; - void Run() override; ~BeamSearchCompute() {} diff --git a/lite/kernels/arm/beam_search_decode_compute.cc b/lite/kernels/arm/beam_search_decode_compute.cc index 49ca51bf697f272dacf55db655bc237aff2cc460..e0d4ae3f13c6b8bf2364ab5d50ec45bb245377c6 100644 --- a/lite/kernels/arm/beam_search_decode_compute.cc +++ b/lite/kernels/arm/beam_search_decode_compute.cc @@ -38,7 +38,7 @@ const size_t kSentenceLevel = 1; template struct Sentence { - std::vector word_ids; + std::vector word_ids; std::vector scores; }; @@ -73,7 +73,7 @@ struct BeamSearchDecoder { std::vector source_level_lod = {0}; std::vector sentence_level_lod = {0}; - std::vector id_data; + std::vector id_data; std::vector score_data; for (size_t src_idx = 0; src_idx < src_num; ++src_idx) { @@ -117,9 +117,9 @@ struct BeamSearchDecoder { *(id_tensor->mutable_lod()) = lod; id_tensor->Resize({static_cast(id_data.size())}); - auto id_ptr = id_tensor->mutable_data(); + auto id_ptr = id_tensor->mutable_data(); TargetCopy( - TARGET(kARM), id_ptr, id_data.data(), id_data.size() * sizeof(float)); + TARGET(kARM), id_ptr, id_data.data(), id_data.size() * sizeof(int64_t)); *(score_tensor->mutable_lod()) = lod; score_tensor->Resize({static_cast(score_data.size())}); @@ -169,7 +169,7 @@ struct BeamSearchDecoder { ++candidate_idx) { prefix_idx_vector.push_back(prefix_idx); size_t idx = prefix_idx_vector.size() - 1; - auto cur_id = cur_ids.data()[candidate_idx]; + auto cur_id = cur_ids.data()[candidate_idx]; auto cur_score = cur_scores.data()[candidate_idx]; sentence_vector.at(idx).word_ids.push_back(cur_id); sentence_vector.at(idx).scores.push_back(cur_score); @@ -184,7 +184,7 @@ struct BeamSearchDecoder { cur_ids.lod().at(kSentenceLevel)[prefix_idx]; for (size_t idx = 0; idx < prefix_idx_vector.size(); ++idx) { auto candidate_idx = prefix_idx_vector.at(idx); - auto cur_id = cur_ids.data()[candidate_idx]; + auto cur_id = cur_ids.data()[candidate_idx]; auto cur_score = 
cur_scores.data()[candidate_idx]; if (cur_id != end_id_ || sentence_vector.at(idx).word_ids.empty()) { // to skip redundant end tokens @@ -293,8 +293,12 @@ REGISTER_LITE_KERNEL(beam_search_decode, kNCHW, paddle::lite::kernels::arm::BeamSearchDecodeCompute, def) - .BindInput("Ids", {LiteType::GetTensorListTy(TARGET(kARM))}) - .BindInput("Scores", {LiteType::GetTensorListTy(TARGET(kARM))}) - .BindOutput("SentenceIds", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("SentenceScores", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Ids", + {LiteType::GetTensorListTy(TARGET(kARM), PRECISION(kInt64))}) + .BindInput("Scores", + {LiteType::GetTensorListTy(TARGET(kARM), PRECISION(kFloat))}) + .BindOutput("SentenceIds", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) + .BindOutput("SentenceScores", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) .Finalize(); diff --git a/lite/kernels/arm/calib_compute.cc b/lite/kernels/arm/calib_compute.cc index 525e5aefd63474cfac09900e9c411ca5e5868311..6dac97dcbc59991d4680ab1a98a54a900573f631 100644 --- a/lite/kernels/arm/calib_compute.cc +++ b/lite/kernels/arm/calib_compute.cc @@ -23,24 +23,24 @@ namespace lite { namespace kernels { namespace arm { -void CalibComputeFp32ToInt8::Run() { - auto& param = this->Param(); +template +void CalibComputeFp32ToInt8::Run() { + auto& param = this->template Param(); std::vector scale = {param.scale}; - const auto* din = param.input->data(); - auto* dout = param.output->mutable_data(); + const auto* din = param.input->template data(); + auto* dout = param.output->template mutable_data(); lite::arm::math::fp32_to_int8( din, dout, scale.data(), 1, 1, param.input->numel()); - return; } -void CalibComputeInt8ToFp32::Run() { - auto& param = this->Param(); - const auto* din = param.input->data(); +template +void CalibComputeInt8ToFp32::Run() { + auto& param = this->template Param(); + const auto* din = param.input->template data(); std::vector scale = {param.scale}; - auto* 
dout = param.output->mutable_data(); + auto* dout = param.output->template mutable_data(); lite::arm::math::int8_to_fp32( din, dout, scale.data(), 1, 1, param.input->numel()); - return; } } // namespace arm @@ -48,43 +48,116 @@ void CalibComputeInt8ToFp32::Run() { } // namespace lite } // namespace paddle -REGISTER_LITE_KERNEL(calib, - kARM, - kInt8, - kNCHW, - paddle::lite::kernels::arm::CalibComputeFp32ToInt8, - fp32_to_int8) +REGISTER_LITE_KERNEL( + calib, + kARM, + kInt8, + kNCHW, + paddle::lite::kernels::arm::CalibComputeFp32ToInt8, + fp32_to_int8) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) .Finalize(); -REGISTER_LITE_KERNEL(calib, - kARM, - kInt8, - kNCHW, - paddle::lite::kernels::arm::CalibComputeInt8ToFp32, - int8_to_fp32) +REGISTER_LITE_KERNEL( + calib, + kARM, + kInt8, + kNCHW, + paddle::lite::kernels::arm::CalibComputeInt8ToFp32, + int8_to_fp32) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))}) .Finalize(); -REGISTER_LITE_KERNEL(calib_once, - kARM, - kInt8, - kNCHW, - paddle::lite::kernels::arm::CalibComputeFp32ToInt8, - fp32_to_int8) + +REGISTER_LITE_KERNEL( + calib, + kARM, + kInt8, + kNHWC, + paddle::lite::kernels::arm::CalibComputeFp32ToInt8, + fp32_to_int8) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kInt8), + DATALAYOUT(kNHWC))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + calib, + kARM, + kInt8, + kNHWC, + paddle::lite::kernels::arm::CalibComputeInt8ToFp32, + int8_to_fp32) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kInt8), + DATALAYOUT(kNHWC))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .Finalize(); + 
+REGISTER_LITE_KERNEL( + calib_once, + kARM, + kInt8, + kNCHW, + paddle::lite::kernels::arm::CalibComputeFp32ToInt8, + fp32_to_int8) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) .Finalize(); -REGISTER_LITE_KERNEL(calib_once, - kARM, - kInt8, - kNCHW, - paddle::lite::kernels::arm::CalibComputeInt8ToFp32, - int8_to_fp32) +REGISTER_LITE_KERNEL( + calib_once, + kARM, + kInt8, + kNCHW, + paddle::lite::kernels::arm::CalibComputeInt8ToFp32, + int8_to_fp32) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))}) .Finalize(); + +REGISTER_LITE_KERNEL( + calib_once, + kARM, + kInt8, + kNHWC, + paddle::lite::kernels::arm::CalibComputeFp32ToInt8, + fp32_to_int8) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kInt8), + DATALAYOUT(kNHWC))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + calib_once, + kARM, + kInt8, + kNHWC, + paddle::lite::kernels::arm::CalibComputeInt8ToFp32, + int8_to_fp32) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kInt8), + DATALAYOUT(kNHWC))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .Finalize(); diff --git a/lite/kernels/arm/calib_compute.h b/lite/kernels/arm/calib_compute.h index 8d9a32bc245579b861607389bac3a4258a0e7abe..a4c8b4c1232101416e95171d70ab629f6a37177b 100644 --- a/lite/kernels/arm/calib_compute.h +++ b/lite/kernels/arm/calib_compute.h @@ -21,8 +21,9 @@ namespace lite { namespace kernels { namespace arm { +template class CalibComputeFp32ToInt8 - : public KernelLite { + : public KernelLite { public: using param_t = operators::CalibParam; @@ -33,8 +34,9 @@ class CalibComputeFp32ToInt8 private: }; +template class 
CalibComputeInt8ToFp32 - : public KernelLite { + : public KernelLite { public: using param_t = operators::CalibParam; diff --git a/lite/kernels/arm/cast_compute.cc b/lite/kernels/arm/cast_compute.cc index 266ae1fc916af4303aca274c39b9b4923fdbb154..3b3ef07e105c583b7e3eb8b64b14610ca0f9e41a 100644 --- a/lite/kernels/arm/cast_compute.cc +++ b/lite/kernels/arm/cast_compute.cc @@ -73,7 +73,7 @@ void CastCompute::Run() { } // namespace paddle REGISTER_LITE_KERNEL( - cast, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::CastCompute, def) + cast, kARM, kAny, kNCHW, paddle::lite::kernels::arm::CastCompute, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))}) .Finalize(); diff --git a/lite/kernels/arm/cast_compute.h b/lite/kernels/arm/cast_compute.h index d342a405ad593b8457b2899fa3ee6ae843d8f792..1f8da056a8be61de20b5d6e98e455e850b9c9f8d 100644 --- a/lite/kernels/arm/cast_compute.h +++ b/lite/kernels/arm/cast_compute.h @@ -23,7 +23,7 @@ namespace lite { namespace kernels { namespace arm { -class CastCompute : public KernelLite { +class CastCompute : public KernelLite { public: using param_t = operators::CastParam; diff --git a/lite/kernels/arm/compare_compute.cc b/lite/kernels/arm/compare_compute.cc index 6118cbc6e403645cada84d2434497b084636a4a3..709942a0d9f385e4ba55be32657633c0edc378cf 100644 --- a/lite/kernels/arm/compare_compute.cc +++ b/lite/kernels/arm/compare_compute.cc @@ -73,8 +73,6 @@ inline void get_mid_dims(const lite::DDim &x_dims, (*post) *= x_dims[i]; } } -template